# Load packages
library(ggtree)
library(Biostrings)
library(tidyverse)
library(phyloseq)
library(ggClusterNet)
library(EasyStat)
library(fs)
library(ggthemes)
library(RColorBrewer)
library(magrittr)
library(MicrobiotaProcess)
library(ggsignif)
library(ggtree)
library(ggtreeExtra)
library(ggstar)
library(MicrobiotaProcess)
library(ggnewscale)
library(grid)
# Read the raw input files: sample metadata, OTU count table, taxonomy table.
# metadata rows are indexed by SampleID for later joins with the OTU table.
metadata = read.delim("data/metadata.tsv")
row.names(metadata) = metadata$SampleID
# OTU abundance table: rows = OTUs, columns = samples
otutab = read.table("data/otutab.txt", header=T, row.names=1, sep="\t",
comment.char="", stringsAsFactors = F)
# Taxonomy table: rows = OTUs, columns = taxonomic ranks
taxonomy = read.table("data/taxonomy.txt", header=T, row.names=1, sep="\t",
comment.char="", stringsAsFactors = F)
# Load packages
library(reshape2)
library(vegan)
library(ggplot2)
library(ggpubr)
library(patchwork)
library(ape)
library(tidyr)
library(MMUPHin) # meta-analysis for microbiome data
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("MMUPHin")
library(magrittr)
library(dplyr)
library(scales)
library(multcompView)
library(ggsignif)
library(amplicon)
library(ggrepel)
library(rdacca.hp)
library(psych)
library(cowplot)
# Shared ggplot theme: white background, no grid, legend hidden, 10 pt sans
# fonts. NOTE: later theme() calls override earlier ones, so the axis.text.x
# setting near the end (hjust = 0.6) wins over the one inside the big theme().
mytheme = theme_bw() + theme(text = element_text(family = "sans", size = 8))+
theme(legend.position="none",
legend.text = element_text(size=10),
legend.title = element_blank(),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text.y = element_text(size=10, colour="black", family = "sans", angle = 0),
axis.text.x = element_text(size=10, colour="black", family = "sans", angle = 0, hjust = 0),
axis.title= element_text(size=10, family = "sans"),
strip.text.x = element_text(size=10, angle = 0),
strip.text.y = element_text(size=10, angle = 0),
plot.title = element_text(size=10, angle = 0),
strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.2))+
theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
theme(axis.line = element_line(size = 0.1, colour = "black"))
#### Alpha diversity ####
# Alternative input: metaphlan4 results from metagenomic analysis
# otutable <- read.table("data/taxonomy.tsv",header=T,sep='\t',stringsAsFactors = F)
# otutable <- separate(otutable, clade_name, c("Kingdom","Phylum","Class","Order","Family","Genus","Species","Taxonomy"),sep="\\|",extra = "drop", fill = "right")
# otutable <- otutable[-which(is.na(otutable$Taxonomy)),]
# Input used here: amplicon or kraken2 results
# metadata = read.delim("data/metadata.tsv")
# row.names(metadata) = metadata$SampleID
# Join counts and taxonomy on OTU id (carried in a temporary Taxonomy column)
otutab = read.table("data/otutab.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
otutab$Taxonomy <- rownames(otutab)
taxonomy = read.table("data/taxonomy.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
taxonomy$Taxonomy <- rownames(taxonomy)
otutable <- merge(taxonomy, otutab, by = "Taxonomy")
# Reorder: ranks in columns 1-7, OTU id in 8, samples from 9 on.
# NOTE(review): c(2:8, 1, 9:26) hard-codes 18 sample columns — confirm this
# matches the actual number of samples in otutab.txt.
otutable <- otutable[, c(2:8, 1, 9:26)]
otutable = data.frame(otutable,stringsAsFactors = F)
otutable[,9:ncol(otutable)] = as.data.frame(lapply(otutable[,9:ncol(otutable)],as.numeric))
#metadata = read.table(paste("data/group_new779.txt",sep=""), header=T, row.names=1, sep="\t", comment.char="")
metadata = read.delim("data/metadata.tsv")
#metadata$Group <- NULL
# The second metadata column is treated as the grouping factor
colnames(metadata)[2] <- 'Group'
# For metagenomic sequencing, compute alpha diversity at the Species level
# i = 7
# For amplicon sequencing, compute alpha diversity at the Genus level
i = 6
# Aggregate OTU counts to the chosen rank: melt to long form, then dcast
# back to a rank-by-sample matrix, summing counts within each taxon.
level = cbind(otutable[,i],otutable[,9:ncol(otutable)])
level = melt(level,id.vars= colnames(level)[1],
measure.vars = colnames(level[,2:ncol(level)]),
variable.name = "sample",value.name = "relative_abundance")
level = dcast(level, otutable[, i] ~ sample, fun.aggregate = sum)
# Transpose so samples are rows (as vegan expects); first row holds taxon names
level = t(level)
colnames(level) = level[1,]
level = level[-1,]
level = data.frame(level,stringsAsFactors = F)
colnames(level) = gsub('\\.','',colnames(level))
level1 = apply(level,2,as.numeric)
rownames(level1)=rownames(level)
level=level1
# Compute diversity indices per sample with vegan
level_diversity = data.frame(Sample_ID = colnames(otutable[9:ncol(otutable)]),
observed_species=specnumber(level),
#chao1 = estimateR(t(otutable[,-c(1:7)]))[2,], # kraken2 results or amplicon results
shannon=vegan::diversity(level, index="shannon"),
simpson=vegan::diversity(level, index="simpson"),
invsimpson=vegan::diversity(level, index="invsimpson"),
Pielou_evenness=vegan::diversity(level,
index="shannon")/log(specnumber(level)))
# Write the per-sample diversity table
write.table(level_diversity, file = paste0('results/Community_diversity_analysis/alpha_diversity/',colnames(otutable)[i],'_alpha_diversity.txt'),
quote = FALSE, sep = "\t", row.names = FALSE,col.names = TRUE)
# Alpha diversity index boxplots for group comparisons
# Source and edited from package amplicon
# Boxplot of one alpha-diversity index with pairwise Wilcoxon significance
# bars. Source and edited from package amplicon.
#
# alpha_div data.frame of alpha indices, samples in rows
# metadata  data.frame of sample metadata, samples in rows
# index     column name of alpha_div to plot
# groupID   column name of metadata holding the grouping factor
# levels    desired group order; all pairwise comparisons are tested
# outlier   kept for interface compatibility (not used)
# Returns a ggplot object.
alpha_boxplot2 <- function(alpha_div, metadata, index = "shannon", groupID = "group", levels = c(), outlier = FALSE){
  # Install missing dependencies on the fly, then attach them quietly
  p_list = c("ggplot2", "dplyr", "multcompView")
  for (p in p_list) {
    if (!requireNamespace(p)) {
      install.packages(p)
    }
    suppressPackageStartupMessages(library(p, character.only = TRUE,
                                           quietly = TRUE, warn.conflicts = FALSE))
  }
  # Keep only samples present in both tables, then align alpha_div to
  # metadata row order (the original did this filtering twice).
  idx = rownames(metadata) %in% rownames(alpha_div)
  metadata = metadata[idx, , drop = F]
  alpha_div = alpha_div[rownames(metadata), ]
  sampFile = as.data.frame(metadata[, groupID], row.names = row.names(metadata))
  df = cbind(alpha_div[rownames(sampFile), index], sampFile)
  colnames(df) = c(index, "group")
  max = max(df[, c(index)])
  min = min(df[, index])
  x = df[, c("group", index)]
  # Per-group maximum, used to position significance labels just above the
  # data. BUG FIX: summarise_() is defunct in current dplyr; use
  # summarise() with .data[[index]] instead.
  y = x %>% group_by(group) %>% summarise(Max = max(.data[[index]]))
  y = as.data.frame(y)
  rownames(y) = y$group
  df$y = y[as.character(df$group), ]$Max + (max - min) * 0.05
  df = df %>%
    mutate(group = ordered(df$group, levels = levels))
  df$class = index
  # All pairwise group comparisons. BUG FIX: the original hard-coded three
  # pairs and produced NA comparisons when only two levels were supplied;
  # combn() generalizes to any number of groups (same pair order for 3).
  compaired = combn(levels, 2, simplify = FALSE)
  #wt = wilcox.test(df[[index]] ~ df$group, alternative=c("two.sided"))
  #FDR = p.adjust(wt$p.value, method = "BH")
  p1 = ggplot(df, aes(x = group, y = .data[[index]])) +
    geom_jitter(aes(color=group),position = position_jitter(0.15), size = 0.3, alpha = 1) +
    # Dashed outline carrying the full whisker range
    geom_boxplot(position=position_dodge(width =0.4),width=0.5, size = 0.4,
                 fill = "transparent",
                 outlier.shape = NA,
                 linetype = "dashed", color = "black") +
    # Solid box from lower to upper hinge only
    stat_boxplot(aes(ymin=..lower..,ymax=..upper..,
                     fill=group
    ),
    color="black",
    fill = "transparent",position=position_dodge(width =0.4),width=0.5, size = 0.4,outlier.shape = NA)+
    stat_boxplot(geom = "errorbar",aes(ymin=..ymax..),
                 width=0.18,color="black",size = 0.4)+
    stat_boxplot(geom = "errorbar",aes(ymax=..ymin..),
                 width=0.18,color="black",size = 0.4)+
    labs(x = NULL, y = NULL, color = groupID) +
    scale_y_continuous(labels = label_number(accuracy = 0.1)) +
    #scale_fill_manual(values = c("#74add1","#a60026"))+
    scale_fill_manual(values = c("#00C0D8","#FF6060", "lightblue"))+
    #scale_color_manual(values = c("#74add1","#a60026"))+
    scale_color_manual(values = c("#00C0D8","#FF6060","lightblue"))+
    # Pairwise Wilcoxon tests drawn as significance bars
    geom_signif(comparisons = compaired,
                step_increase = 0.3,
                map_signif_level = F,
                test = wilcox.test,
                color = "black",
                size = 0.2,
                textsize = 3
    )+
    mytheme+
    facet_grid(.~class)
  p1
}
# Prepare data for the boxplots: index the diversity table by sample id
alpha_div <- level_diversity
# NOTE(review): sub(".x", "") treats "." as a regex wildcard, so it removes
# any character followed by "x" — if the intent is a literal ".x" suffix,
# use fixed = TRUE. Kept as in the original.
alpha_div$Sample_ID = sub(".x","", alpha_div$Sample_ID)
rownames(alpha_div) = alpha_div$Sample_ID
metadata2 <- as.data.frame(metadata)
rownames(metadata2) <- metadata2$SampleID
metadata2$Group2 <- metadata2$Group
# Plot three indices for the three groups (KO / OE / WT)
p1 <- alpha_boxplot2(alpha_div, metadata2, index = "shannon", groupID = "Group2", levels = c("KO", "OE", "WT"))
p2 <- alpha_boxplot2(alpha_div, metadata2, index = "invsimpson", groupID = "Group2", levels = c("KO", "OE", "WT"))
p3 <- alpha_boxplot2(alpha_div, metadata2, index = "Pielou_evenness", groupID = "Group2", levels = c("KO", "OE", "WT"))
library(patchwork)
# Combine the three panels side by side and save
p_all_alpha = p1 + p2 + p3
ggsave(paste("results/Community_diversity_analysis/alpha_diversity/Alpha_diversity01",".pdf", sep=""), p_all_alpha, width=105 * 1.5, height=65 * 1.5, unit='mm')
p_all_alpha
# Rarefaction-curve section: helper functions and input data
library(vegan)
library(picante)
# Read the OTU abundance table (rows = OTUs, columns = samples)
otu <- read.table("data/otutab.txt", header=T, row.names=1, sep="\t",
comment.char="", stringsAsFactors = F)
# vegan expects samples in rows and taxa in columns, so transpose
otu <- t(otu)
##定义函数(Define functions)
#计算多种 Alpha 多样性指数,结果返回至向量
#Calculate multiple Alpha diversity indices and return the results to a vector
# Compute one alpha-diversity index per row (sample) of a community matrix.
#
# x      community matrix, samples in rows and taxa in columns
# method one of 'richness', 'chao1', 'ace', 'shannon', 'simpson',
#        'pielou' (Pielou evenness), 'gc' (Good's coverage) or
#        'pd' (PD_whole_tree; requires `tree`)
# tree   phylo object, used only when method = 'pd'
# base   logarithm base for the Shannon-based indices
#
# Returns a numeric vector with one value per row of `x`.
alpha_index <- function(x, method = 'richness', tree = NULL, base = exp(1)) {
  if (method == 'richness') result <- rowSums(x > 0)
  else if (method == 'chao1') result <- estimateR(x)[2, ]
  else if (method == 'ace') result <- estimateR(x)[4, ]
  else if (method == 'shannon') result <- diversity(x, index = 'shannon',
                                                    base = base)
  else if (method == 'simpson') result <- diversity(x, index = 'simpson')
  # Pielou evenness: Shannon divided by the log of observed richness
  else if (method == 'pielou') result <-
    diversity(x, index = 'shannon', base = base) / log(estimateR(x)[1, ], base)
  else if (method == 'gc') result <- 1 - rowSums(x == 1) / rowSums(x) # goods_coverage
  # BUG FIX: use scalar && in the if() condition (was vector &)
  else if (method == 'pd' && !is.null(tree)) { # PD_whole_tree
    pd <- pd(x, tree, include.root = FALSE)
    result <- pd[ ,1]
    names(result) <- rownames(pd)
  }
  # BUG FIX: an unknown method (or 'pd' without a tree) previously fell
  # through every branch and failed with "object 'result' not found";
  # fail with a clear message instead.
  else stop("alpha_index: unknown method '", method,
            "' (or missing tree for 'pd')", call. = FALSE)
  result
}
# 根据抽样步长(step),统计每个稀释梯度下的 Alpha 多样性指数,结果返回至列表
# According to the sampling step, count the Alpha diversity index under each dilution gradient, and return the results to the list
# For each sample (row) of `x`, rarefy to a series of depths spaced by
# `step` and compute the chosen alpha-diversity index at every depth.
#
# x      community matrix, samples in rows and taxa in columns
# step   spacing between successive rarefaction depths
# method index name passed through to alpha_index()
# rare   fixed target depth for every sample; NULL means each sample's
#        own total count
# tree   phylo object forwarded to alpha_index() for method = 'pd'
# base   logarithm base forwarded to alpha_index()
#
# Returns a named list (one element per sample) of numeric vectors whose
# names are the rarefaction depths.
alpha_curves <- function(x, step, method = 'richness', rare = NULL, tree = NULL, base = exp(1)) {
  n_samples <- nrow(x)
  # Target depth per sample: its own library size, or the fixed `rare`
  depths <- if (is.null(rare)) rowSums(x) else rep(rare, n_samples)
  curves <- lapply(seq_len(n_samples), function(i) {
    grid <- seq(0, depths[i], step)
    # Always include the final (full) depth even when step does not divide it
    if (max(grid) < depths[i]) grid <- c(grid, depths[i])
    values <- unlist(lapply(grid, function(d)
      alpha_index(x = rrarefy(x[i, ], d),
                  method = method, tree = tree, base = base)))
    names(values) <- grid
    values
  })
  names(curves) <- rownames(x)
  curves
}
## Quick test of the helpers
# Shannon index of every sample in the OTU table, using base e
shannon_index <- alpha_index(otu, method = 'shannon', base = exp(1))
# Rarefy the OTU table in steps of 1000 sequences up to each sample's full
# depth and record the Shannon index at every depth (base e)
shannon_curves <- alpha_curves(otu, step = 1000, method = 'shannon', base = exp(1))
# shannon_curves
# Built-in vegan rarefaction curve with a sampling step of 2000 sequences
rarecurve(otu, step = 2000, col = c('red', 'green', 'blue', 'orange', 'purple', 'black'))
## Richness index curve
# Draw an alpha-diversity rarefaction curve using species richness
# with a sampling step of 2000 sequences
richness_curves <- alpha_curves(otu, step = 2000, method = 'richness')
# Flatten the per-sample list into one long data frame for ggplot2
plot_richness <- data.frame()
for (i in names(richness_curves)) {
richness_curves_i <- (richness_curves[[i]])
richness_curves_i <- data.frame(rare = names(richness_curves_i), alpha = richness_curves_i, sample = i, stringsAsFactors = FALSE)
plot_richness <- rbind(plot_richness, richness_curves_i)
}
rownames(plot_richness) <- NULL
# Depth labels come back as character vector names; convert to numeric
plot_richness$rare <- as.numeric(plot_richness$rare)
plot_richness$alpha <- as.numeric(plot_richness$alpha)
# Plot one line per sample; dashed vertical line marks the smallest library size
library(ggplot2)
p1 <- ggplot(plot_richness, aes(rare, alpha, color = sample)) +
geom_line() +
labs(x = 'Number of sequences', y = 'Richness', color = NULL) +
theme(panel.grid = element_blank(),
panel.background = element_rect(fill = 'transparent', color = 'black'),
legend.key = element_rect(fill = 'transparent')) +
geom_vline(xintercept = min(rowSums(otu)), linetype = 2) +
scale_x_continuous(breaks = seq(0, 30000, 5000), labels = as.character(seq(0, 30000, 5000)))
ggsave(paste("results/Community_diversity_analysis/alpha_diversity/Alpha_rarefaction_curve01",".pdf", sep=""), p1, width=105 * 1.5, height=65 * 1.5, unit='mm')
## Repeat the rarefaction several times to obtain mean +/- standard deviation
# Resample 5 times (rrarefy is stochastic, so each run differs)
plot_richness <- data.frame()
for (n in 1:5) {
richness_curves <- alpha_curves(otu, step = 2000, method = 'richness')
for (i in names(richness_curves)) {
richness_curves_i <- (richness_curves[[i]])
richness_curves_i <- data.frame(rare = names(richness_curves_i), alpha = richness_curves_i, sample = i, stringsAsFactors = FALSE)
plot_richness <- rbind(plot_richness, richness_curves_i)
}
}
# Mean +/- standard deviation per sample and depth (summaryBy() from doBy)
library(doBy)
plot_richness_stat <- summaryBy(alpha~sample+rare, plot_richness, FUN = c(mean, sd))
plot_richness_stat$rare <- as.numeric(plot_richness_stat$rare)
# At depth 0 the sd is meaningless; blank it so no error bar is drawn
plot_richness_stat[which(plot_richness_stat$rare == 0),'alpha.sd'] <- NA
# Lines with points and error bars; dashed line marks the smallest library size
p2 <- ggplot(plot_richness_stat, aes(rare, alpha.mean, color = sample)) +
geom_line() +
geom_point() + geom_errorbar(aes(ymin = alpha.mean - alpha.sd, ymax = alpha.mean + alpha.sd), width = 500) +
labs(x = 'Number of sequences', y = 'Richness', color = NULL) +
theme(panel.grid = element_blank(), panel.background = element_rect(fill = 'transparent', color = 'black'),
legend.key = element_rect(fill = 'transparent')) +
geom_vline(xintercept = min(rowSums(otu)), linetype = 2) +
scale_x_continuous(breaks = seq(0, 30000, 5000), labels = as.character(seq(0, 30000, 5000)))
ggsave(paste("results/Community_diversity_analysis/alpha_diversity/Alpha_rarefaction_curve02",".pdf", sep=""), p2, width=105 * 1.5, height=65 * 1.5, unit='mm')
## Other alpha-diversity index curves
## Shannon index curve
# If a simple geom_line() is too jagged, a fitted/spline curve can be used.
# Generate extra low-depth points (step 200 up to depth 200) so the spline
# is smoother near the origin, then the regular 2000-step curve.
shannon_curves1 <- alpha_curves(otu, step = 200, rare = 200, method = 'shannon')
shannon_curves2 <- alpha_curves(otu, step = 2000, method = 'shannon')
shannon_curves <- c(shannon_curves1, shannon_curves2)
# Flatten into a long data frame and sort by sample then depth
plot_shannon <- data.frame()
for (i in 1:length(shannon_curves)) {
shannon_curves_i <- shannon_curves[[i]]
shannon_curves_i <- data.frame(rare = names(shannon_curves_i), alpha = shannon_curves_i, sample = names(shannon_curves)[i], stringsAsFactors = FALSE)
plot_shannon <- rbind(plot_shannon, shannon_curves_i)
}
rownames(plot_shannon) <- NULL
plot_shannon$rare <- as.numeric(plot_shannon$rare)
plot_shannon$alpha <- as.numeric(plot_shannon$alpha)
plot_shannon <- plot_shannon[order(plot_shannon$sample, plot_shannon$rare), ]
# Smoothed X-spline curves from the ggalt package
library(ggalt)
p3 <- ggplot(plot_shannon, aes(rare, alpha, color = sample)) +
geom_xspline() +
labs(x = 'Number of sequences', y = 'Shannon', color = NULL) +
theme(panel.grid = element_blank(), panel.background = element_rect(fill = 'transparent', color = 'black'),
legend.key = element_rect(fill = 'transparent')) +
geom_vline(xintercept = min(rowSums(otu)), linetype = 2) +
scale_x_continuous(breaks = seq(0, 30000, 5000), labels = as.character(seq(0, 30000, 5000)))
ggsave(paste("results/Community_diversity_analysis/alpha_diversity/Alpha_rarefaction_curve03",".pdf", sep=""), p3, width=105 * 1.5, height=65 * 1.5, unit='mm')
## PD_whole_tree curve
# For PD_whole_tree (phylogenetic diversity), the phylogenetic tree file
# is needed in addition to the OTU abundance table.
# Load the OTU abundance table and the phylogenetic tree
otu <- read.table("data/otutab.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
otu <- t(otu)
#tree <- read.tree('otu_tree.tre')
library(ggtree)
library(ape)
tree <- read.tree("data/otus.tree")
# BUG FIX: alpha_curves() was previously called once BEFORE `tree` was
# read, which raised "object 'tree' not found"; the tree is now loaded
# first and the premature duplicate call is removed.
# Rarefy in steps of 2000 sequences
pd_curves <- alpha_curves(otu, tree = tree, step = 2000, method = 'pd')
# Flatten the per-sample PD curves into a long data frame and sort
plot_pd <- data.frame()
for (i in 1:length(pd_curves)) {
pd_curves_i <- pd_curves[[i]]
pd_curves_i <- data.frame(rare = names(pd_curves_i), alpha = pd_curves_i, sample = names(pd_curves)[i], stringsAsFactors = FALSE)
plot_pd <- rbind(plot_pd, pd_curves_i)
}
rownames(plot_pd) <- NULL
plot_pd$rare <- as.numeric(plot_pd$rare)
plot_pd$alpha <- as.numeric(plot_pd$alpha)
plot_pd <- plot_pd[order(plot_pd$sample, plot_pd$rare), ]
# Smoothed X-spline curves; dashed line marks the smallest library size
library(ggalt)
p4 <- ggplot(plot_pd, aes(rare, alpha, color = sample)) +
geom_xspline() +
labs(x = 'Number of sequences', y = 'PD_whole_tree', color = NULL) +
theme(panel.grid = element_blank(), panel.background = element_rect(fill = 'transparent', color = 'black'),
legend.key = element_rect(fill = 'transparent')) +
geom_vline(xintercept = min(rowSums(otu)), linetype = 2) +
scale_x_continuous(breaks = seq(0, 30000, 5000), labels = as.character(seq(0, 30000, 5000)))
ggsave(paste("results/Community_diversity_analysis/alpha_diversity/Alpha_rarefaction_curve04",".pdf", sep=""), p4, width=105 * 1.5, height=65 * 1.5, unit='mm')
#### Beta diversity ####
# Alternative input: metaphlan4 results from metagenomic analysis
# otutable <- read.table("data/taxonomy.tsv",header=T,sep='\t',stringsAsFactors = F)
# otutable <- separate(otutable, clade_name, c("Kingdom","Phylum","Class","Order","Family","Genus","Species","Taxonomy"),sep="\\|",extra = "drop", fill = "right")
# otutable <- otutable[-which(is.na(otutable$Taxonomy)),]
# Input used here: amplicon or kraken2 results
#metadata = read.delim("data/metadata.tsv")
#row.names(metadata) = metadata$SampleID
# Rebuild the joined table (ranks 1-7, OTU id, samples from column 9)
otutab = read.table("data/otutab.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
otutab$Taxonomy <- rownames(otutab)
taxonomy = read.table("data/taxonomy.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
taxonomy = taxonomy[, 1:7]
taxonomy$Taxonomy <- rownames(taxonomy)
otutable <- merge(taxonomy, otutab, by = "Taxonomy")
# NOTE(review): hard-codes 18 sample columns (9:26) — confirm against data
otutable <- otutable[, c(2:8, 1, 9:26)]
otutable = data.frame(otutable,stringsAsFactors = F)
otutable[,9:ncol(otutable)] = as.data.frame(lapply(otutable[,9:ncol(otutable)],as.numeric))
#metadata = read.table(paste("data/group_new779.txt",sep=""), header=T, row.names=1, sep="\t", comment.char="")
metadata = read.delim("data/metadata.tsv")
#metadata$Group <- NULL
colnames(metadata)[2] <- 'Group'
#### Beta diversity ####
i=6 # Select the Genus level for the beta-diversity analysis
# Aggregate counts to the chosen rank (melt + dcast, summing within taxa)
level = cbind(otutable[,i],otutable[,9:ncol(otutable)])
level = melt(level,id.vars= colnames(level)[1],
measure.vars = colnames(level[,2:ncol(level)]),
variable.name = "sample",value.name = "relative_abundance")
level = dcast(level, otutable[, i] ~ sample, fun.aggregate = sum)
# Transpose so samples are rows
level = t(level)
colnames(level) = level[1,]
level = level[-1,]
level = data.frame(level,stringsAsFactors = F)
colnames(level) = gsub('\\.','',colnames(level))
level1 = apply(level,2,as.numeric)
rownames(level1)=rownames(level)
level=level1
# Detrended correspondence analysis to choose the ordination method
DCA=decorana(level)
# DCA=summary(DCA)
# Check the DCA1 axis length: > 4.0 -> use CCA; 3.0-4.0 -> RDA or CCA;
# < 3.0 -> RDA is preferable.
# Convert counts to relative abundances (each sample column sums to 1)
RA <- otutable
RA[,9:ncol(RA)] <- apply(RA[,9:ncol(RA)],2, function(x) x / sum(x) )
level = cbind(RA[,i],RA[,9:ncol(RA)])
level = melt(level,id.vars= colnames(level)[1],
measure.vars = colnames(level[,2:ncol(level)]),
variable.name = "Sample_ID",value.name = "relative_abundance")
level = dcast(level, RA[, i] ~ Sample_ID, fun.aggregate = sum)
# Transpose so samples are rows
level = t(level)
colnames(level) = level[1,]
level = level[-1,]
level = data.frame(level,stringsAsFactors = F)
colnames(level) = gsub('\\.','',colnames(level))
level1 = apply(level,2,as.numeric)
rownames(level1)=rownames(level)
level <- as.data.frame(level1)
# Attach metadata so group information travels with the abundance table
level.reset = level
level.reset$Sample_ID <- rownames(level.reset)
metadata$Sample_ID <- metadata$SampleID
level.reset <- merge(metadata, level.reset, by='Sample_ID')
colnames(level.reset)[3] = 'Group2'
rownames(level.reset) = level.reset[,1]
# Beta-diversity distance matrix; vegdist() defaults to method = 'bray'
# (Bray-Curtis). NOTE(review): -c(1:12) assumes the first 12 columns are
# metadata — confirm against the merged table's layout.
level_distance = vegdist(level.reset[,-c(1:12)])
level_distance = as.matrix(level_distance)
# Write the Bray-Curtis matrix
write.table(level_distance, file = paste0('results/Community_diversity_analysis/beta_diversity/',colnames(otutable)[i],'_beta_diversity.txt'),
quote = FALSE, sep = "\t", row.names = FALSE,col.names = TRUE)
# Beta diversity PCoA: function to draw PCoA plots
# Source and edited from package amplicon
# PCoA ordination plot from a distance matrix, colored by group.
# Source and edited from package amplicon.
#
# dis_mat  square distance matrix with sample names on rows/columns
# metadata data.frame of sample metadata, samples in rows
# groupID  metadata column used for point color
# groupID2 metadata column used for point shape
# levels   desired group order (the first three are used for ellipses)
# ellipse  draw a 78% confidence ellipse per group
# label    add sample-name labels with ggrepel
# PCo      axis pair to draw: 12, 13 or 23
# Returns a ggplot object.
beta_pcoa2 = function (dis_mat, metadata, groupID = "Group", groupID2 = "Group", levels = c(), ellipse = T,
    label = F, PCo = 12)
{
    # Install missing dependencies on the fly, then attach them quietly
    p_list = c("ggplot2", "vegan", "ggrepel")
    for (p in p_list) {
        if (!requireNamespace(p)) {
            install.packages(p)
        }
        suppressWarnings(suppressMessages(library(p, character.only = T)))
    }
    # Keep only samples present in both tables; align the distance matrix
    idx = rownames(metadata) %in% rownames(dis_mat)
    metadata = metadata[idx, , drop = F]
    dis_mat = dis_mat[rownames(metadata), rownames(metadata)]
    sampFile = as.data.frame(metadata[, groupID], row.names = row.names(metadata))
    sampFile2 = as.data.frame(metadata[, groupID2], row.names = row.names(metadata))
    # Principal coordinates analysis (classical MDS), first three axes
    pcoa = cmdscale(dis_mat, k = 3, eig = T)
    points = as.data.frame(pcoa$points)
    eig = pcoa$eig
    points = cbind(points, sampFile[rownames(points), ])
    points = cbind(points, sampFile2[rownames(points), ])
    colnames(points) = c("x", "y", "z", "group", "group2")
    # NOTE(review): this overrides the groupID2 column with metadata$Group,
    # relying on identical row order; kept from the original.
    points$group2 = metadata$Group
    #points$y = -points$y
    points = points %>%
        mutate(group = ordered(points$group, levels = levels))
    # Axis labels report the % variance explained by the matching eigenvalue
    if (PCo == 12) {
        p = ggplot(points, aes(x = x, y = y, color = group, shape = group2)) +
            labs(x = paste("PCo axis 1 (", format(100 * eig[1]/sum(eig),
                digits = 4), "%)", sep = ""), y = paste("PCo axis 2 (",
                format(100 * eig[2]/sum(eig), digits = 4), "%)",
                sep = ""), color = groupID)
    }
    if (PCo == 13) {
        # BUG FIX: axis 3 previously reported eig[2]
        p = ggplot(points, aes(x = x, y = z, color = group, shape = group2)) +
            labs(x = paste("PCo axis 1 (", format(100 * eig[1]/sum(eig),
                digits = 4), "%)", sep = ""), y = paste("PCo axis 3 (",
                format(100 * eig[3]/sum(eig), digits = 4), "%)",
                sep = ""), color = groupID)
    }
    if (PCo == 23) {
        # BUG FIX: axes 2/3 previously reported eig[1]/eig[2]
        p = ggplot(points, aes(x = y, y = z, color = group, shape = group2)) +
            labs(x = paste("PCo axis 2 (", format(100 * eig[2]/sum(eig),
                digits = 4), "%)", sep = ""), y = paste("PCo axis 3 (",
                format(100 * eig[3]/sum(eig), digits = 4), "%)",
                sep = ""), color = groupID)
    }
    p = p + geom_point(alpha = 0.7, size = 0.6) + theme_classic() +
        theme(text = element_text(family = "sans", size = 7)
        )+
        scale_fill_manual(values = c("#00C0D8","#FF6060", "lightblue"))+
        scale_color_manual(values = c("#00C0D8","#FF6060", "lightblue"))+
        scale_shape_manual(values = c(19, 19, 19))
    # coord_fixed(ratio = 1) was disconnected in the original (the trailing
    # "+" was commented out); it is left disabled to preserve the output.
    if (ellipse == T) {
        # 78% confidence ellipse per group plus the shared theme tweaks
        p = p +
            stat_ellipse(data = filter(points, group == levels[1]), aes(group = group), level = 0.78, size = 0.4)+
            stat_ellipse(data = filter(points, group == levels[2]), aes(group = group), level = 0.78, size = 0.4)+
            stat_ellipse(data = filter(points, group == levels[3]), aes(group = group), level = 0.78, size = 0.4)+
            theme(text = element_text(family = "sans", size = 7))+theme(legend.position="none",
                legend.text = element_text(size=10),
                legend.title = element_blank(),
                axis.text.y = element_text(size=10, colour="black", family = "sans", angle = 0),
                axis.text.x = element_text(size=10, colour="black", family = "sans", angle = 0, hjust = 0),
                axis.title= element_text(size=10)
            )+
            theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
            theme(axis.line = element_line(size = 0.2, colour = "black"))
    }
    if (label == T) {
        p = p + geom_text_repel(label = paste(rownames(points)),
            colour = "black", size = 2)
    }
    p
}
# Genus-level PCoA: subset and reorder the distance matrix to match metadata
distance_mat = level_distance
# typeof(distance_mat)
rownames(metadata) <- metadata$SampleID
# metadata2 = t(metadata)
metadata2 = metadata
# Keep only samples present in metadata (rows, then columns)
distance_mat2 = distance_mat[rownames(distance_mat) %in% rownames(metadata2), ]
distance_mat3 = distance_mat2[, colnames(distance_mat2) %in% rownames(metadata2)]
distance_mat = distance_mat3
# Reorder both dimensions to metadata row order (transpose twice to keep
# the matrix square and symmetric)
rows01 = rownames(metadata2)
distance_mat2 = distance_mat[rows01, ]
distance_mat2 = t(distance_mat2)
distance_mat3 = distance_mat2[rows01, ]
distance_mat = t(distance_mat3)
# PCoA on the aligned distance matrix
level_distance <- distance_mat
# cmdscale() performs classical multidimensional scaling (PCoA)
pcoa = cmdscale (level_distance,eig=TRUE,k =3)
level.pcoa = pcoa$points[,c(1,2)]
colnames(level.pcoa)=c('PC1','PC2')
# Percent variance explained by each axis
pc = round(pcoa$eig/sum(pcoa$eig)*100,digits=2)
level.pcoa = as.data.frame(level.pcoa)
level.pcoa$Sample_ID = row.names(level.pcoa)
level.pcoa = merge(metadata,level.pcoa,by='Sample_ID')
colnames(level.pcoa)[3] = 'Group2'
level.pcoa$Group2 = factor(level.pcoa$Group2,levels = c('KO','OE','WT'))
# Significance tests
# anosim, generally used with NMDS
# level.anosim = anosim(level_distance,level.pcoa[,3],permutations = 999)
# adonis (PERMANOVA), generally used with PCoA.
# NOTE(review): columns 13/14 are assumed to be PC1/PC2 after the merge —
# confirm against the metadata column count.
level.adonis = adonis2(level_distance~level.pcoa[,3],data=level.pcoa[,c(13,14)],distance = "bray",permutations = 999)
level.adonis
# Console output of the adonis2 call above, kept for reference:
# Permutation test for adonis under reduced model
# Terms added sequentially (first to last)
# Permutation: free
# Number of permutations: 999
# adonis2(formula = level_distance ~ level.pcoa[, 3], data = level.pcoa[, c(13, 14)], permutations = 999, distance = "bray")
#                 Df SumOfSqs      R2      F Pr(>F)
# level.pcoa[, 3]  2 0.061682 0.27855 2.8957  0.001 ***
# Residual        15 0.159757 0.72145
# Total           17 0.221439 1.00000
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Draw the PCoA (outer parentheses also print the plot)
(p_beta_s = beta_pcoa2(distance_mat, metadata2, groupID = "Group",groupID2 = "Group",levels = c('KO','OE','WT'), ellipse = T))
# Annotate with the PERMANOVA statistics.
# NOTE(review): the hard-coded label says P = 0.002 while the adonis2
# output above reports Pr(>F) = 0.001 — confirm which run this refers to.
p_beta_s = p_beta_s + annotate(geom = "text", x = -0.19, y = 0.15, label = "R2 = 0.27855, P = 0.002", hjust = "left", size = 3)+
ggtitle("Species")+theme(plot.title = element_text(face="bold", size = 10))
ggsave(paste("results/Community_diversity_analysis/beta_diversity/beta_diversity_genus",".pdf", sep=""), p_beta_s, width=89 * 1.5, height=70 * 1.5, unit='mm')
p_beta_s
# rm(list = ls())
# Load packages
# 载入软件包
library(MicrobiotaProcess)
library(dplyr)
library(ggplot2)
library(phyloseq)
library(ggtree)
library(ggtreeExtra)
library(ggstar)
library(forcats)
library(conflicted)
# Resolve masked-function conflicts explicitly before the MPSE workflow
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
conflict_scout()
conflicted::conflicts_prefer(phyloseq::tax_table)
# Load the sample table, OTU table and taxonomy table
sample <- read.table("data/sample.txt",check.names = F, row.names = 1, header = 1, sep = "\t")
OTU<- read.table("data/otu.txt",check.names = F, row.names = 1, header = 1, sep = "\t")
Tax <- read.table("data/tax.txt",check.names = F, row.names = 1, header = 1, sep = "\t")
# Assemble a phyloseq object from the three tables
ps <- phyloseq(sample_data(sample),
otu_table(as.matrix(OTU), taxa_are_rows=TRUE),
tax_table(as.matrix(Tax)))
# Convert to the MicrobiotaProcess MPSE format for downstream analysis
df <- ps %>% as.MPSE()
# Compute relative species abundance
df %<>%
mp_cal_abundance( # for each samples
.abundance = RareAbundance
) %>%
mp_cal_abundance( # for each groups
.abundance=RareAbundance,
.group=group
)
# Differential-abundance analysis (LEfSe-like: Kruskal-Wallis, then
# pairwise Wilcoxon, then LDA effect size)
df %<>%
mp_diff_analysis(
.abundance = RelRareAbundanceBySample,
.group = group,
tip.level = "OTU",
force = FALSE,
relative = TRUE,
taxa.class = "all",
first.test.method = "kruskal.test",
first.test.alpha = 0.05,
p.adjust = "fdr",
filter.p = "fdr",
strict = TRUE,
fc.method = "generalizedFC",
second.test.method = "wilcox.test",
second.test.alpha = 0.05,
cl.min = 4,
cl.test = TRUE,
subcl.min = 3,
subcl.test = TRUE,
ml.method = "lda",# 'lda' or 'rf'
normalization = 1e+06,
ldascore = 2,# LDA score threshold
bootnums = 30,
sample.prop.boot = 0.7,
ci = 0.95,
seed = 123,
type = "species"
)
# Extract the taxonomy tree (with test results attached) for visualization
# with ggtree and friends
taxa.tree <- df %>%
mp_extract_tree(type="taxatree")
taxa.tree
# Console output of `taxa.tree`, kept for reference:
# 'treedata' S4 object'.
# ...@ phylo:
# Phylogenetic tree with 60 tips and 126 internal nodes.
# Tip labels:
#   OTU31, OTU53, OTU9, OTU23, OTU48, OTU21, ...
# Node labels:
#   r__root, k__Bacteria, p__Actinobacteria, p__Bacteroidetes, p__Chloroflexi,
#   p__Firmicutes, ...
# Rooted; no branch lengths.
# with the following features available:
#   'nodeClass', 'nodeDepth', 'RareAbundanceBySample', 'RareAbundanceBygroup',
#   'LDAupper', 'LDAmean', 'LDAlower', 'Sign_group', 'pvalue', 'fdr'.
# The associated data tibble abstraction: 186 x 13
# The 'node', 'label' and 'isTip' are from the phylo tree.
#    node label isTip nodeClass nodeDepth RareAbundanceBySample
#   <dbl> <chr> <lgl> <chr>         <dbl> <list>
#  1    1 OTU31 TRUE  OTU               8 <tibble [12 x 4]>
#  2    2 OTU53 TRUE  OTU               8 <tibble [12 x 4]>
#  3    3 OTU9  TRUE  OTU               8 <tibble [12 x 4]>
#  4    4 OTU23 TRUE  OTU               8 <tibble [12 x 4]>
#  5    5 OTU48 TRUE  OTU               8 <tibble [12 x 4]>
#  6    6 OTU21 TRUE  OTU               8 <tibble [12 x 4]>
#  7    7 OTU57 TRUE  OTU               8 <tibble [12 x 4]>
#  8    8 OTU20 TRUE  OTU               8 <tibble [12 x 4]>
#  9    9 OTU26 TRUE  OTU               8 <tibble [12 x 4]>
# 10   10 OTU25 TRUE  OTU               8 <tibble [12 x 4]>
# ... 176 more rows; 7 more variables: RareAbundanceBygroup <list>,
#   LDAupper <dbl>, LDAmean <dbl>, LDAlower <dbl>, Sign_group <chr>,
#   pvalue <dbl>, fdr <dbl>
# Inspect the differential-abundance statistics attached to the tree nodes
taxa.tree %>%
select(label, nodeClass, LDAupper, LDAmean, LDAlower, Sign_group, pvalue, fdr) %>%
dplyr::filter(!is.na(fdr))
# Console output of the select/filter pipeline above, kept for reference:
# A tibble: 184 x 8
#    label nodeClass LDAupper LDAmean LDAlower Sign_group  pvalue    fdr
#    <chr> <chr>        <dbl>   <dbl>    <dbl> <chr>        <dbl>  <dbl>
#  1 OTU31 OTU          NA      NA       NA    <NA>       0.00523 0.0446
#  2 OTU53 OTU          NA      NA       NA    <NA>       0.368   0.389
#  3 OTU9  OTU           4.39    4.33     4.27 C          0.00598 0.0446
#  4 OTU23 OTU           4.02    3.96     3.88 C          0.00921 0.0446
#  5 OTU48 OTU           4.48    4.42     4.34 B          0.00537 0.0446
#  6 OTU21 OTU          NA      NA       NA    <NA>       0.0999  0.145
#  7 OTU57 OTU          NA      NA       NA    <NA>       0.368   0.389
#  8 OTU20 OTU          NA      NA       NA    <NA>       0.0229  0.0539
#  9 OTU26 OTU          NA      NA       NA    <NA>       0.0163  0.0539
# 10 OTU25 OTU          NA      NA       NA    <NA>       0.0152  0.0539
# ... 174 more rows
p1 <- ggtree(
taxa.tree,
#aes(color = taxa.tree@phylo[["node.label"]]),
layout="radial",
size = 0.5,open.angle=15, branch.length = "none") +
geom_hilight(data = td_filter(nodeClass == "Phylum"),
mapping = aes(node = node, fill = label),alpha = 0.2)
p1
# Ring 1: proportion heatmap drawn around the taxonomic tree
p12 <- p1# + ggnewscale::new_scale_fill()
# Proportion matrix; row names must match the tree tip labels.
# NOTE(review): header = 1 is coerced to TRUE by read.table — works, but
# header = TRUE would be clearer.
proportions_df <- read.table("data/ring02.txt",check.names = F, row.names = 1, header = 1, sep = "\t")
proportions_df <- as.data.frame(proportions_df)
library(ggtext)
# gheatmap() attaches the matrix as a heatmap ring; a negative offset moves
# the ring inwards relative to the tips
p2<-gheatmap(p12,
proportions_df,
offset=-0.5, width=0.15,
colnames=FALSE) +
scale_fill_manual(
values = c("#FFB5C5", "#FF7256", "#EE0000","#EBF7FA","#4DBBD5FF","#FFB5C5", "#FF7256", "#EE0000"),
name = "Proportion"
) +
labs(fill="Proportion of detected") +
theme(legend.title = element_markdown())
p2
# Ring 2: detection status of the *bsh* gene
# NOTE(review): new_scale_fill() is commented out, so the scale_fill_manual()
# below replaces the previous fill scale — confirm this is intended
p3 <- p2# + ggnewscale::new_scale_fill()
status_df <- read.table("data/ring03.txt",check.names = F, row.names = 1, header = 1, sep = "\t")
status_df <- as.data.frame(status_df)
p4 <- gheatmap(p3,
status_df,
offset=0.5, width=0.15,
colnames=FALSE) +
scale_fill_manual(values=c("#EBF7FA","#4DBBD5FF","#FFB5C5", "#FF7256", "#EE0000","#EBF7FA","#4DBBD5FF","#FFB5C5", "#FF7256", "#EE0000"),
name="*bsh* gene") +
theme(legend.title = element_markdown())
p4
# Ring 3: bar ring of LDA effect sizes for the significant taxa
p42 <- p4 + ggnewscale::new_scale_fill()
p5 <- p42 + geom_fruit(
geom = geom_col,
mapping = aes(
x = LDAmean,
fill = Sign_group,
subset = !is.na(LDAmean)),
orientation = "y",
offset = 0.3,
pwidth = 0.5,
axis.params = list(axis = "x",
title = "Log10(LDA)",
title.height = 0.01,
title.size = 2,
text.size = 1.8,
vjust = 1),
grid.params = list(linetype = 2))+
geom_tiplab(size=2, offset=5.5)
p5
# Open the radial layout by 30 degrees and rotate so labels sit in the gap
p6 <- open_tree(p5, 30) %>% rotate_tree(-76)
ggsave(filename = "results/Community_diversity_analysis/taxonomic_tree/Species_Taxonomic_Tree.pdf", plot = p6, width = 10, height = 8, units = "in", dpi = 300)
p6
# Chord diagram of mean phylum abundance per group (circlize)
# install.packages("circlize")
library(circlize) # load the package
library(reshape2)
# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Phylum level: keep the Phylum column plus the 18 sample columns
otutab_p <- otutab2[, c(2, 8:25)]
# Sum counts per phylum
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]# 1963 species
otutab_p3 <- t(otutab_p2)
otutab_p3 <- as.data.frame(otutab_p3)
otutab_p3$group <- rownames(otutab_p3)
# Strip digits from sample names to obtain the group label (e.g. "S1" -> "S")
otutab_p3$group = gsub("[0-9]","", otutab_p3$group)
# Mean abundance per group
otutab_p4 <- aggregate(.~ group, data = otutab_p3, mean)
rownames(otutab_p4) <- otutab_p4$group
# Convert wide data to long format (group, bacteria, abundance)
long_data01 <- melt(otutab_p4,
id.vars = "group", # Specify columns that do not need to be converted
variable.name = "bacteria", # New column name, used to store the original column name
value.name = "abundance" # New column name, used to store the original value
)
# Quick on-screen preview
chordDiagram(long_data01)
circos.clear()
# Save as pdf file
pdf("results/Community_diversity_analysis/species_composition/species_composition_chord_diagram.pdf", width = 8, height = 8)
chordDiagram(long_data01,
grid.col = c(S1 = "lightcoral", S2 = "coral2", S3 = "coral4",# The color of the ring
E1 = "lightgreen", E2 = "green", E3 = "green3", E4 = "green4", E5 = "olivedrab4"),
annotationTrack = c("name", "grid"), # Display variable names and circles, no coordinate axis
col = hcl.colors(15), # The color of the stripes
transparency = 0.5, # Transparency
directional = 1, # The direction of the strip
link.lwd = 1, # Width
link.lty = 2, # Type
link.border = 1) # Color
circos.clear()
dev.off()
# Pasted console output from dev.off(), commented out so the script runs:
#> png
#> 2
# Load package
library(ggplot2)
library(ggprism)
library(dplyr)
library(plyr)
library(dplyr)
library(ggalluvial)
library(tidyverse)
library(tidyr)
library(reshape2)
library(ggpubr)
# Shared ggplot theme: sans font, black axis text, grey facet strips, no grid.
# The trailing theme() calls deliberately override axis.text.x / axis.line set
# earlier in the chain (later theme() calls win in ggplot2).
# NOTE(review): `size` in element_rect()/element_line() is deprecated in
# newer ggplot2 in favour of `linewidth` — still works with a warning.
mytheme = theme_bw() + theme(text = element_text(family = "sans", size = 6))+
theme(#legend.position="none",
legend.text = element_text(size=12),
legend.title = element_blank(),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text.y = element_text(size=12, colour="black", family = "sans", angle = 0),
axis.text.x = element_text(size=12, colour="black", family = "sans", angle = 0, hjust = 0),
axis.title= element_text(size=12),
strip.text.x = element_text(size=12, angle = 0),
strip.text.y = element_text(size=12, angle = 0),
plot.title = element_text(size=12, angle = 0),
strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.2))+
theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
theme(axis.line = element_line(size = 0.1, colour = "black"))
# Sum of Phylum
df3_p <- read.table(file = "data/sum_p.txt", sep = "\t", header = T, check.names = FALSE)
design <- read.table(file = "data/group.txt", sep = "\t", header = T, row.names=1)
# Aggregate counts to phylum level
data_p<-aggregate(.~ Phylum,data=df3_p,sum)
rownames(data_p) = data_p$Phylum
data_p = data_p[, -1]# 17
# write.csv(data_p, "results/779samples_phylum.csv")
# Sort phyla by decreasing total abundance
mean_sort = data_p[(order(-rowSums(data_p))), ]
mean_sort = as.data.frame(mean_sort)
mean_sort2 = t(mean_sort)
# Order samples by abundance of the most abundant phylum
mean_sort2 = mean_sort2[order(-mean_sort2[,1]),]
mean_sort3 = t(mean_sort2)
# Per-sample relative abundance (each column sums to 1)
mean_sort3 = apply(mean_sort3, 2, function(x) x/sum(x))
mean_sort3 = as.data.frame(mean_sort3)
# Filter Top 5, and collapse the remaining phyla into "others"
other = colSums(mean_sort3[6:dim(mean_sort3)[1], ])
# NOTE(review): (6 - 1):1 keeps rows 5..1, i.e. the top 5 in reverse order
mean_sort3 = mean_sort3[(6 - 1):1, ]
mean_sort3 = rbind(other,mean_sort3)
rownames(mean_sort3)[1] = c("others")
mean_sort3 = as.data.frame(mean_sort3)
# Stackplot for each sample: attach sample -> group mapping
sampFile = data.frame(sample = row.names(design), group = design$Group,
row.names = row.names(design))
mean_sort3$tax = rownames(mean_sort3)
# Calculate average relative abundance for each group
mat_t = t(mean_sort3)
mat_t2 = merge(sampFile, mat_t, by = "row.names")
mat_t2 = mat_t2[,c(-1,-2)]
mat_t2 = as.data.frame(mat_t2)
mat_t2$group = as.factor(mat_t2$group)
mat_t3 = mat_t2[, -1]
mat_t3 = mutate_all(mat_t3, as.numeric)
mat_t3$group = mat_t2$group
mat_t3 = as.data.frame(mat_t3)
mat_mean2 = aggregate(.~group, data = mat_t3, FUN=function(x) mean(x))
# rbind the aggregate columns so taxa become rows; drop the group row
mat_mean_final = do.call(rbind, mat_mean2)[-1,]
geno = mat_mean2$group
colnames(mat_mean_final) = geno
mean_sort=as.data.frame(mat_mean_final)
# Data collation: taxon names as a column for melting
mean_sort$tax = rownames(mean_sort)
mean_sort4 = as.data.frame(mean_sort)
mean_sort4$tax = mean_sort$tax
# Long format (tax, variable = group, value) sorted by abundance
data_all22 = as.data.frame(melt(mean_sort4, id.vars=c("tax")))
data_all22 = data_all22[order(-data_all22$value), ]
# Plot: stacked (alluvial-ready) bar chart of mean phylum composition per group
p_phylum01 = ggplot(data_all22, aes(x=factor(variable, levels = unique(variable)),
y = value, fill = factor(tax, levels = unique(tax)),
stratum = factor(tax, levels = unique(tax)),
alluvium = factor(tax, levels = unique(tax)))) +
geom_bar(stat = "identity", position = "fill", width=0.2)+
scale_y_continuous(labels = scales::percent, expand = c(0,0)) +
coord_cartesian(ylim = c(0,1))+
xlab("")+
ylab("Percentage (%)")+ theme_classic()+
guides(fill=guide_legend(title="Phylum"))+
theme(legend.key.size = unit(0.4, "cm"))+
theme(text = element_text(family = "sans", size = 8))+
theme(#legend.position="none",
legend.text = element_text(size=12),
legend.title = element_blank(),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text.y = element_text(size=12, colour="black",
family = "sans", angle = 0),
axis.text.x = element_text(size=12, colour="black",
family = "sans", angle = 0, hjust = 0),
axis.title= element_text(size=12),
strip.text.x = element_text(size=12, angle = 0),
strip.text.y = element_text(size=12, angle = 0),
plot.title = element_text(size=12, angle = 0),
strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.2))+
theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
theme(axis.line = element_line(size = 0.1, colour = "black"))+
# Fixed palette for the six strata (top-5 phyla + "others")
scale_fill_manual(values=c("#e1abbc","#edd064","#0eb0c8","#f2ccac","#a1d5b9","#6a73cf")) +
scale_color_manual(values=c("#e1abbc","#edd064","#0eb0c8","#f2ccac","#a1d5b9","#6a73cf"))+
geom_col(width = 0.5, color=NA)
ggsave(paste("results/Community_diversity_analysis/species_composition/Phylum_composition_stack",".pdf", sep=""), p_phylum01, width=79 * 1.5, height=69 * 1.5, unit='mm')
p_phylum01
# Comparison of Microbial Differences under Classification Levels in Box plots
data_p2 <- data_p
mean_sort = data_p2[(order(-rowSums(data_p2))), ]# Decreasing order by total abundance
mean_sort = as.data.frame(mean_sort)
mean_sort2 = t(mean_sort)
# Order samples by abundance of the most abundant phylum
mean_sort2 = mean_sort2[order(-mean_sort2[,1]),]
mean_sort3 = t(mean_sort2)
# Per-sample relative abundance
mean_sort3 = apply(mean_sort3, 2, function(x) x/sum(x))
mean_sort3 = as.data.frame(mean_sort3)
# Filter Top 5 (rows 5..1, i.e. top five in reverse order)
mean_sort3 = mean_sort3[(6 - 1):1, ]
mean_sort3 = t(mean_sort3)
mean_sort3 = as.data.frame(mean_sort3)
# Derive the group label by stripping digits from the sample name
mean_sort3$group = rownames(mean_sort3)
mean_sort3$group = gsub("[0-9]","", mean_sort3$group)
# Wilcoxon rank-sum test per phylum (group must have exactly 2 levels),
# adjusted by BH method; retain adjusted-p value < 0.05 downstream
diff_phylum <- mean_sort3 %>%
select_if(is.numeric) %>%
map_df(~ broom::tidy(wilcox.test(. ~ group,data = mean_sort3, conf.int = TRUE)), .id = 'var')
diff_phylum$padjust <- p.adjust(diff_phylum$p.value,"BH")
write.csv(diff_phylum, "results/Community_diversity_analysis/species_composition/Phylum_selected_wilcox_test.csv")
# Melt data from wide to long for the grouped boxplot
data_long_m<-melt(mean_sort3, id.vars = c("group"),
measure.vars = c('Firmicutes','Bacteroidetes','Proteobacteria',
'Actinobacteria','Verrucomicrobia'),
variable.name = c('Phylum'),
value.name = 'value')
# Boxplot; default significance test method is the Wilcoxon rank-sum test
p_phylum02 <- ggplot(data_long_m,aes(x=Phylum,y=value,fill=group))+
stat_boxplot(geom = "errorbar",width=0.4,position=position_dodge(0.8))+
geom_boxplot(width=0.6,alpha=1,position=position_dodge(0.8), outlier.shape = NA)+mytheme+
theme(legend.position = "top")+
# Per-phylum significance stars between the two groups
stat_compare_means(aes(group=group), method = "wilcox.test",label="p.signif")+
labs(x = "Phylum", y = "Percentage (%)")+
scale_y_continuous(labels = scales::percent, expand = c(0,0.1))+
# Hollow jittered points over the boxes
geom_jitter(aes(color=group),
shape=21, size=0.6,alpha=0.5,
fill="transparent",
position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8)
)+
scale_fill_manual(values = c("#4e8397","#ff8066"))+
scale_color_manual(values = c("#4e8397","#ff8066"))+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
ggsave(paste("results/Community_diversity_analysis/species_composition/Phylum_select_wilcox",".pdf", sep=""), p_phylum02, width=109 * 1.5, height=89 * 1.5, unit='mm')
p_phylum02
library(patchwork)
# Combine the stacked bar chart and the boxplot side by side
p_phylum <- p_phylum01 + p_phylum02 + plot_layout(ncol = 2, widths = c(1, 4))
p_phylum
ggsave(paste("results/Community_diversity_analysis/species_composition/Phylum_stackplot_wilcox",".pdf", sep=""), p_phylum, width=159 * 1.5, height=79 * 1.5, unit='mm')
# install.packages("circlize")
library(circlize) # load the package
library(reshape2)
# Import or construct the data
# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Phylum level
otutab_p <- otutab2[, c(2, 8:25)]
# Sum of phylum
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]# 1963 species
otutab_p3 <- t(otutab_p2)
otutab_p3 <- as.data.frame(otutab_p3)
otutab_p3$group <- rownames(otutab_p3)
# Keep per-sample resolution here (digit stripping deliberately disabled)
#otutab_p3$group = gsub("[0-9]","", otutab_p3$group)
# Convert wide data to long format
long_data02 <- melt(otutab_p3,
id.vars = "group",
variable.name = "bacteria",
value.name = "abundance"
)
# Group label derived from the sample name by stripping digits
long_data02$group2 <- long_data02$group
long_data02$group2 = gsub("[0-9]","", long_data02$group2)
# write.csv(long_data02, "results/Species_composition_circle_strack_bar_diagram.csv")
# Re-read the curated long table used for plotting
# NOTE(review): presumably the csv above after manual editing — confirm
long_data03 = read.table("data/Species_composition_circle_strack_bar_diagram.txt",
header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Add a column to the data set for x (disabled alternative)
# library(tidyverse)
# long_data01 %>%
# ggpubr::mutate(new_x = rep(paste0('X',formatC(1:6,width = 2,flag = 0)),each=8)) -> dat01
# Basic stacked bar chart
library(ggplot2)
ggplot(data = long_data03,aes(x=group,y=abundance,fill=bacteria))+
geom_bar(stat = "identity",position = "fill")
# Wrap into a ring with coord_polar(); negative y-limit opens the centre hole
ggplot(data = long_data03,aes(x=group,y=abundance,fill=bacteria))+
geom_bar(stat = "identity",position = "fill")+
coord_polar()+
theme_bw()+
ylim(-1,NA)
# Refine details: rotate each x label to follow the circle
p1 <- ggplot(data = long_data03,aes(x=group,y=abundance,fill=bacteria))+
geom_bar(stat = "identity",position = "fill")+
coord_polar()+
#scale_x_discrete(expand = expansion(add = 0),
# labels=coalesce(long_data03$group[seq(1,348,8)],""))+
scale_y_continuous(limits = c(-1,NA))+
theme(axis.text.x = element_text(angle = cumsum(c(90,-rep(12,15))),
vjust=0,hjust = 1),
panel.background = element_blank(),
axis.text.y = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank())
p1
# Inspect the phylum levels present; pasted console output commented out
# so the script can be sourced:
unique(long_data03$bacteria)
#> [1] "Acidobacteria" "Actinobacteria"
#> [3] "Armatimonadetes" "Bacteroidetes"
#> [5] "Candidatus_Saccharibacteria" "Chlamydiae"
#> [7] "Chloroflexi" "Firmicutes"
#> [9] "Ignavibacteriae" "Nitrospirae"
#> [11] "Planctomycetes" "Proteobacteria"
#> [13] "Spirochaetes" "Thaumarchaeota"
#> [15] "Unassigned" "Verrucomicrobia"
#> [17] NA
# Add group labels inside the ring hole and a fixed phylum -> colour mapping
p2 <- p1 +
annotate(geom = "text",x=4,y=-0.1,label="Group_A",angle=-60)+
annotate(geom = "text",x=9,y=-0.1,label="Group_B",angle=10)+
annotate(geom = "text",x=16,y=-0.1,label="Group_C",angle=-110)+
scale_fill_manual(values = c("Acidobacteria"="#98d09d","Actinobacteria"="#d7e698",
"Armatimonadetes"="#5ebcc2","Bacteroidetes"="#46a9cb",
"Candidatus_Saccharibacteria"="#dadada","Chlamydiae"="#fbf398",
"Chloroflexi"="#5791c9","Firmicutes"="#7a76b7",
"Ignavibacteriae"="#f7a895","Nitrospirae"="#e77381",
"Planctomycetes"="#945893","Proteobacteria"="#5abf7e",
"Spirochaetes"="#9b8191","Thaumarchaeota"="#8f888b",
"Unassigned"="#ffe8d2","Verrucomicrobia"="#946f5c"),
#limits=c("EX","EW","CR","EN","VU","DD","NT","LC"),
name="")
p2
ggsave(paste("results/Community_diversity_analysis/species_composition/Phylum_circle_stackplot",".pdf", sep=""), p2, width=159 * 1.5, height=79 * 1.5, unit='mm')
# Use a ternary plot to show shared OTUs or microbial communities among three treatments
# install.packages("circlize")
library(circlize)
library(reshape2)
library(ggtern)
# Import or construct the data
# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Phylum level
otutab_p <- otutab2[, c(2, 8:25)]
# Sum of phylum
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]# 1963 species
otutab_p3 <- t(otutab_p2)
otutab_p3 <- as.data.frame(otutab_p3)
otutab_p3$group <- rownames(otutab_p3)
# Group label = sample name without digits
otutab_p3$group = gsub("[0-9]","", otutab_p3$group)
# Mean abundance per group
otutab_p4 <- aggregate(.~ group, data = otutab_p3, mean)
rownames(otutab_p4) <- otutab_p4$group
otutab_p5 <- as.data.frame(t(otutab_p4))
otutab_p5 <- otutab_p5[-1, ]
#write.csv(otutab_p5, "results/otutab_p5.csv")
# Re-read an annotated version (with Average and type columns)
# NOTE(review): presumably the csv above after manual editing — confirm
otutab_p6 = read.table("data/otutab_p5.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Plot: KO/OE/WT are the three ternary axes; point size = mean abundance
p1 <- ggtern(data=otutab_p6,ggtern::aes(KO,OE,WT)) +geom_mask() +
geom_point(ggtern::aes(size=Average,color=type),alpha=1)+ggtern::theme_bw() +
theme(axis.text=element_blank(),
axis.ticks=element_blank())
p1
ggtern::ggsave(paste("results/Community_diversity_analysis/specific_microbe/Ternary_diagram",".pdf", sep=""), p1, width=159 * 1.5, height=79 * 1.5, unit='mm')
# Venn diagram of three overlapping OTU-ID subsets (ggVennDiagram)
# Load packages
library(ggVennDiagram)
library(ggplot2)
# Read the OTU/taxonomy table
otu_tax <- read.csv("data/test_otu.csv", row.names = 1)
# Build three deliberately overlapping subsets of OTU identifiers
soil_otus <- rownames(otu_tax)[1:120]
env_otus <- rownames(otu_tax)[50:250]
micro_otus <- rownames(otu_tax)[200:400]
# Named list of sets for the Venn plot
Venn_data <- list(Soil = soil_otus, Env1 = env_otus, Micro = micro_otus)
# Plot with thin dashed edges and a diverging fill palette
p1 <- ggVennDiagram(Venn_data, edge_lty = "dashed", edge_size = 0.1) +
scale_fill_distiller(palette = "RdBu")
p1
ggplot2::ggsave(paste("results/Community_diversity_analysis/specific_microbe/Total_microbe_Specific microbe_venn_diagram",".pdf", sep=""), p1, width=159 * 1.5, height=79 * 1.5, unit='mm')
# Draw Venn diagrams and save them as PDFs
# install.packages("VennDiagram")
library(VennDiagram)
# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Sample columns only; each column is treated as one set of values
otutab_p <- otutab2[, c(8:25)]
color1 <- scales::alpha("#99CC00",0.9)
color2 <- scales::alpha("#c77cff",0.9)
color3 <- scales::alpha("#f8766d",0.9)
color4 <- scales::alpha("#F3C300",0.8)
color5 <- scales::alpha("#FF99CC",0.7)
# label_alpha = 0 removes the background fill behind the text labels.
# print() ensures the ggplot objects are actually rendered into the pdf
# devices when this script is run via source()/Rscript.
pdf("results/Community_diversity_analysis/specific_microbe/venn_ex2.pdf",width = 8,height = 7)
print(ggVennDiagram(otutab_p[1:2], set_size = 8,edge_size = 1.2,label_size =6,label_alpha=0) +
scale_fill_gradient(low="white",high =color2))
dev.off()
# Pasted console output from dev.off(), commented out:
#> png
#> 2
pdf("results/Community_diversity_analysis/specific_microbe/venn_ex3.pdf",width = 8,height = 7)
print(ggVennDiagram(otutab_p[1:3], set_size = 8,edge_size = 1.2,label_size =6,label_alpha=0) +
scale_fill_gradient(low="white",high =color3))
dev.off()
#> png
#> 2
pdf("results/Community_diversity_analysis/specific_microbe/venn_ex4.pdf",width = 8,height = 7)
print(ggVennDiagram(otutab_p[1:4], label_alpha=0) +
scale_fill_gradient(low="white",high =color4 ,guide="none"))
dev.off()
#> png
#> 2
pdf("results/Community_diversity_analysis/specific_microbe/venn_ex5.pdf",width = 8,height = 7)
print(ggVennDiagram(otutab_p[1:5], label_alpha=0,label_size =3) +
scale_color_brewer(palette = "Paired")+
scale_fill_gradient(low="white",high = color5))
dev.off()
#> png
#> 2
# Plot UpSet
# Load packages
library(UpSetR)
library(openxlsx)
library(RColorBrewer)
library(ggplot2)
# Build a reproducible toy data set for the UpSet demo
# Set seed
set.seed(123)
# Set row numbers
n_rows <- 100
# Randomly select a number between 3 and 5 as the number of repetitions
n_rep <- sample(3:5, 1)
# Randomly select some numbers to repeat across all three columns
repeated_values <- sample(1:100, n_rep, replace = FALSE)
# Generate three columns; the first n_rep entries are identical across the
# columns, guaranteeing a non-empty triple intersection
data <- data.frame(
column1 = c(repeated_values, sample(1:100, n_rows - n_rep, replace = TRUE)),
column2 = c(repeated_values, sample(1:100, n_rows - n_rep, replace = TRUE)),
column3 = c(repeated_values, sample(1:100, n_rows - n_rep, replace = TRUE))
)
# Check data
head(data)
# Pasted console output, commented out so the script can be sourced:
#> column1 column2 column3
#> 1 79 79 79
#> 2 51 51 51
#> 3 14 14 14
#> 4 67 67 67
#> 5 42 42 42
#> 6 50 85 72
# upset() argument notes:
# nsets: maximum number of sets that can be displayed
# nintersects: how many intersections are shown
# mb.ratio: ratio of the dot-matrix plot to the bar graph
# order.by: how to sort the intersections (here by freq, then by degree)
# decreasing: sort direction per variable (freq descending, degree ascending)
# Quick default plot
upset(fromList(data))
# Adjusted and enhanced composite plot
pdf("results/Community_diversity_analysis/specific_microbe/upset01.pdf",width = 10,height = 7)
# print() ensures the plot is rendered into the pdf device when source()d
print(upset(fromList(data),
nsets=length(data),
nintersects=30,# show only the leading intersections
sets=c("column1","column2","column3"),
number.angles = 0,
point.size=4,
line.size=1,
mainbar.y.label="Intersection size",
main.bar.color = 'black',
matrix.color="black",
sets.x.label="Set size",
sets.bar.color=brewer.pal(3,"Set1"),
mb.ratio = c(0.7, 0.3),
order.by = "freq",
text.scale=c(1.5,1.5,1.5,1.5,1.5,1),
shade.color="red"
))
dev.off()
# Pasted console output from dev.off(), commented out:
#> png
#> 2
# Highlight the intersection of specific sets
pdf("results/Community_diversity_analysis/specific_microbe/upset02.pdf",width = 10,height = 7)
# print() ensures the plot is rendered into the pdf device when source()d
print(upset(fromList(data),
nsets=length(data),
nintersects=30,
sets=c("column1","column2","column3"),
number.angles = 0,
point.size=4,
line.size=1,
mainbar.y.label="Intersection size",
main.bar.color = 'black',
matrix.color="black",
sets.x.label="Set size",
sets.bar.color=brewer.pal(3,"Set1"),
mb.ratio = c(0.7, 0.3),
order.by = "freq",
text.scale=c(1.5,1.5,1.5,1.5,1.5,1),
shade.color="red",
# Set the intersections of the specific groups you want to highlight
queries=list(list(query=intersects,params=list("column1","column2"),color="red",active=T),
list(query=intersects,params=list("column1","column3"),color="blue",active=T),
list(query=intersects,params=list("column1","column2","column3"),color="green",active=T)
)
))
dev.off()
# Pasted console output from dev.off(), commented out:
#> png
#> 2
# Venn network: shared phyla between groups drawn as a network (pcutils)
# Load packages
library(MetaNet)
library(pcutils)
# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Select data: Phylum column plus the 18 sample columns
otutab_p <- otutab2[, c(2, 8:25)]
# Sum of phylum
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]
otutab_p3 <- t(otutab_p2)
otutab_p3 <- as.data.frame(otutab_p3)
otutab_p3$group <- rownames(otutab_p3)
# Group label = sample name without digits
otutab_p3$group = gsub("[0-9]","", otutab_p3$group)
# Mean abundance per group; phyla back as rows
otutab_p4 <- aggregate(.~ group, data = otutab_p3, mean)
rownames(otutab_p4) <- otutab_p4$group
otutab_p4 <- as.data.frame(t(otutab_p4))
otutab_p4 <- otutab_p4[-1, ]
pdf("results/Community_diversity_analysis/specific_microbe/venn_network01.pdf",width = 8,height = 7)
# NOTE(review): if venn() returns a ggplot rather than drawing directly,
# wrap it in print() so the pdf is not empty when source()d — confirm
venn(otutab_p4, mode = "network")
dev.off()
# Pasted console output from dev.off(), commented out:
#> png
#> 2
# Another options
# EVenn: Easy to create repeatable and editable Venn diagrams and Venn networks online
# https://mp.weixin.qq.com/s/WQXq7ssRb96ZBVNqdOYBig
# https://mp.weixin.qq.com/s/aeu_teN5DRGB6DvYfDg_QQ
# Install and load the required R packages
# install.packages("remotes")
# remotes::install_github("davidsjoberg/ggsankey")
library(ggsankey)
library(ggalluvial)
library(ggplot2)
# Read data
df01 <- read.table(file = "data/data_sankey3.txt", sep = "\t", header = TRUE, check.names = FALSE)
data <- df01
# Convert data to lodes form (one row per alluvium/axis combination)
df <- to_lodes_form(data[, 1:ncol(data)],
axes = 1:ncol(data),
id = "value")
# Draw the Sankey diagram; the colour vector is recycled across strata
col <- rep(c('#0ca9ce', '#78cfe5', '#c6ecf1', '#ff6f81', '#ff9c8f', '#ffc2c0', '#d386bf',
'#cdb1d2', '#fae6f0', '#eb6fa6', '#ff88b5', '#00b1a5', "#ffa68f", "#ffca75", "#97bc83", "#acd295",
"#00ada1", "#009f93", "#ace2da", "#448c99", "#00b3bc", "#b8d8c9", "#db888e", "#e397a4", "#ead0c7",
"#8f9898", "#bfcfcb"), 6)
p1 <- ggplot(df, ggplot2::aes(x = x, fill = stratum, label = stratum,
stratum = stratum, alluvium = value), width = 0.1) +
geom_flow(width = 0.1,
curve_type = "sine",
alpha = 0.6,
color = 'white',
size = 0.05) +
geom_stratum(width = 0.1, color = "white") +
geom_text(stat = 'stratum', size = 2.5, color = 'black') +
scale_fill_manual(values = col) +
ggplot2::theme_void() +
theme(legend.position = 'none')
# Save the plot
ggplot2::ggsave(filename = "results/Community_diversity_analysis/sankey_plot/sankey_plot01.pdf", plot = p1, width = 7, height = 5, useDingbats = FALSE, limitsize = FALSE)
# Display the plot
p1
library(dplyr)
library(ggalluvial)
library(tidyverse)
library(tidyr)
library(reshape2)
library(ggpubr)
# Sum of Phylum
df3_p <- read.table(file = "data/sum_p.txt", sep = "\t", header = T, check.names = FALSE)
design <- read.table(file = "data/group.txt", sep = "\t", header = T, row.names=1)
data_p<-aggregate(.~ Phylum,data=df3_p,sum)
rownames(data_p) = data_p$Phylum
data_p = data_p[, -1]# 17
#write.csv(data_p, "results/779samples_phylum.csv")
# Comparison of Microbial Differences under Classification Levels in Box plots
data_p2 <- data_p
mean_sort = data_p2[(order(-rowSums(data_p2))), ]# Decreasing order by relative abundance
mean_sort = as.data.frame(mean_sort)
mean_sort2 = t(mean_sort)
mean_sort2 = mean_sort2[order(-mean_sort2[,1]),]
mean_sort3 = t(mean_sort2)
# Per-sample relative abundance
mean_sort3 = apply(mean_sort3, 2, function(x) x/sum(x))
mean_sort3 = as.data.frame(mean_sort3)
# Keep all phyla here (top-N filter disabled)
#mean_sort3 = mean_sort3[(12 - 1):1, ]
mean_sort3 = t(mean_sort3)
mean_sort3 = as.data.frame(mean_sort3)
mean_sort3$group = rownames(mean_sort3)
# NOTE(review): sub() replaces only the FIRST digit with "_", then gsub()
# strips the remaining digits — confirm the resulting labels are intended
mean_sort3$group = sub("[0-9]","_", mean_sort3$group)
mean_sort3$group = gsub("[0-9]","", mean_sort3$group)
# Drop columns whose values are all zero
library(dplyr)
mean_sort4 <- mean_sort3 %>%
select(where(~ any(. != 0)))
# The 12 here must be adjusted to the number of columns remaining in the table
mean_sort5 <- mean_sort4[, -12]
# log10-transformation (pseudo-count avoids log10 of zero)
mean_sort52 = log10(mean_sort5 + 1e-05)
# z-score standardization per column
mean_sort6 = apply(mean_sort52, 2, function(x){
return((x-mean(x))/sd(x))
})
#mean_sort6 = t(mean_sort6)
mean_sort6 <- as.data.frame(mean_sort6)
mean_sort6$group <- mean_sort3$group
# wilcox test, adjusted by BH method and retain adjusted-p value < 0.05
diff_phylum <- mean_sort6 %>%
select_if(is.numeric) %>%
map_df(~ broom::tidy(wilcox.test(. ~ group,data = mean_sort6, conf.int = TRUE)), .id = 'var')
diff_phylum$padjust <- p.adjust(diff_phylum$p.value,"BH")
write.csv(diff_phylum, "results/Difference_analysis/Wilcox/Wilcox_test01.csv")
# edgeR differential abundance analysis of KO gene families
# Load packages
library(edgeR)
# Load data
df_KO <- read.table(file = "data/ko_gene_data.txt", sep = "\t", header = T, check.names = FALSE)
design <- read.table(file = "data/group_KO2.txt", sep = "\t", header = T, row.names=1)
df_KO2 <- df_KO
rownames(df_KO2) <- df_KO$Gene_family
df_KO2 <- df_KO2[, -1]
design2 <- t(design)
# Construct the DGEList object (group labels taken from the transposed design)
dgelist <- DGEList(counts = df_KO2, group = design2)
dgelist
# Pasted console output, commented out so the script can be sourced:
#> An object of class "DGEList"
#> $counts
#> Healthy01 Healthy02 Healthy03 Healthy04 Healthy05
#> K00001 5513652.000 5.030384e+06 5.011590e+06 5.071771e+06 5.070956e+06
#> K00002 26183663.270 2.746872e+07 2.277124e+07 2.964312e+07 2.409072e+07
#> K00003 0.000 0.000000e+00 0.000000e+00 1.058710e+01 0.000000e+00
#> K00004 0.000 1.055100e+01 0.000000e+00 1.289566e+01 4.103165e+01
#> K00005 265.022 3.200185e+01 1.353479e+02 1.232836e+02 1.529002e+02
#> Healthy06 Healthy07 Healthy08 Healthy09 Healthy10
#> K00001 4.112742e+06 6.706504e+06 7159849.000 5.833764e+06 5.260313e+06
#> K00002 2.322997e+07 2.542978e+07 24574795.310 2.980676e+07 2.381957e+07
#> K00003 0.000000e+00 0.000000e+00 0.000 0.000000e+00 0.000000e+00
#> K00004 0.000000e+00 9.239522e+00 0.000 0.000000e+00 5.735490e+01
#> K00005 4.719408e+01 1.890843e+02 196.229 6.976540e+01 1.093376e+02
#> Patients01 Patients02 Patients03 Patients04 Patients05
#> K00001 5.196891e+06 5.611760e+06 9.980051e+06 6.585471e+06 6.156271e+06
#> K00002 2.753910e+07 3.440364e+07 3.475113e+07 2.459670e+07 2.896511e+07
#> K00003 4.333895e+01 5.293551e+01 0.000000e+00 0.000000e+00 0.000000e+00
#> K00004 0.000000e+00 2.344529e+01 2.238694e+02 0.000000e+00 2.227433e+01
#> K00005 9.333383e+02 7.053999e+01 4.654725e+02 1.726085e+02 1.669811e+02
#> Patients06 Patients07 Patients08 Patients09 Patients10
#> K00001 7.695370e+06 5.269730e+06 1.041806e+07 4.992085e+06 3.976257e+06
#> K00002 2.431583e+07 2.690927e+07 2.909897e+07 6.317151e+07 2.386349e+07
#> K00003 5.433096e+02 0.000000e+00 1.251203e+01 9.624639e+00 1.455884e+02
#> K00004 6.916764e+01 0.000000e+00 7.577184e+01 0.000000e+00 0.000000e+00
#> K00005 2.569143e+02 1.599218e+02 6.971007e+01 1.307975e+02 1.535119e+02
#> 195 more rows ...
#> $samples
#> group lib.size norm.factors
#> Healthy01 Healthy 31746866 1
#> Healthy02 Healthy 32561511 1
#> Healthy03 Healthy 27807172 1
#> Healthy04 Healthy 34762673 1
#> Healthy05 Healthy 29207202 1
#> 15 more rows ...
# Filter lowly expressed genes.
# edgeR recommends filtering on CPM (counts per million): keep a gene whose
# CPM exceeds 1 in at least 2 samples, i.e. its count in the smallest library
# is above (that library's total counts / 1e6).
keep <- rowSums(cpm(dgelist) > 1 ) >= 2
dgelist <- dgelist[keep, ,keep.lib.sizes = FALSE]
# TMM normalization
dgelist_norm <- calcNormFactors(dgelist, method = 'TMM')
# Unsupervised clustering of samples (MDS)
plotMDS(dgelist_norm, col = rep(c('red', 'blue'), each = 5), dim = c(1, 2))
# Estimate dispersion.
# NOTE(review): the model matrix below overwrites the `design` data frame
# read above — works here, but reusing the name is confusing
group <- design2[1,]
design <- model.matrix(~group)
dge <- estimateDisp(dgelist_norm, design, robust = TRUE)
plotBCV(dge)
# Differential gene analysis:
# negative binomial generalized log-linear model
fit <- glmFit(dge, design, robust = TRUE)
lrt <- glmLRT(fit) # likelihood-ratio test
topTags(lrt)
# Pasted console output, commented out:
#> Coefficient: groupPatients
#> logFC logCPM LR PValue FDR
#> K00106 9.206795 0.04061104 20.81904 5.047850e-06 0.0006562206
#> K00180 8.505768 -0.61463819 18.18007 2.009705e-05 0.0010226081
#> K00185 8.341201 -0.76684000 17.44843 2.952080e-05 0.0010226081
#> K00089 6.647059 1.29328702 17.32721 3.146486e-05 0.0010226081
#> K00105 7.130335 0.33157245 15.72705 7.317012e-05 0.0016010605
#> K00199 5.448346 2.14159636 15.70840 7.389510e-05 0.0016010605
#> K00172 6.050092 1.75470139 15.15904 9.882384e-05 0.0016735281
#> K00109 7.188984 0.27482588 15.08115 1.029863e-04 0.0016735281
#> K00162 5.630761 1.93221277 14.60246 1.327409e-04 0.0018393508
#> K00193 6.816308 -0.23316972 14.48222 1.414885e-04 0.0018393508
#write.csv(topTags(lrt, n = nrow(dgelist$counts)), 'npc_glmQLFTest.csv', quote = FALSE)
# DESeq2 differential abundance analysis of KO gene families
# Load packages
library(DESeq2)
# Load data
df_KO <- read.table(file = "data/ko_gene_data.txt", sep = "\t", header = T, check.names = FALSE)
design <- read.table(file = "data/group_KO2.txt", sep = "\t", header = T, row.names=1)
design$group <- factor(design$group)
df_KO2 <- df_KO
rownames(df_KO2) <- df_KO$Gene_family
df_KO2 <- df_KO2[, -1]
# Sanity check: sample order matches between counts and metadata
identical(colnames(df_KO2), rownames(design))
#> [1] TRUE
rows_to_keep <- base::intersect(colnames(df_KO2), rownames(design))
group <- design[rows_to_keep,,drop=F]
df_KO3 <- df_KO2[,rows_to_keep]
identical(colnames(df_KO3), rownames(group))
#> [1] TRUE
colnames(group)[1] <- "Group"
# Filter genes with count values of 0 in all samples
df_KO4 <- df_KO3[rowSums(df_KO3)!=0, ]
# Keep only the genes listed in the DMO table
DMO <- read.table(file = "data/DMO.txt", sep = "\t", header = T, row.names=1)
df_KO5 <- df_KO4[rownames(df_KO4) %in% rownames(DMO), ]
# Differential analysis (Patients vs Healthy); poscounts handles sparse rows
dds <- DESeq2::DESeqDataSetFromMatrix(countData = round(df_KO5),
colData=group,
design = ~ Group)
dds_res <- DESeq2::DESeq(dds, sfType = "poscounts")
res <- results(dds_res,
tidy=T,
format="DataFrame",
contrast = c("Group","Patients","Healthy"))
# head(res)
# Volcano plot
DEG<-res
logFC_cutoff<-1
# NOTE(review): padj < 0.5 is an unusually loose significance cutoff —
# confirm this is intended (0.05 is conventional)
DEG$change<-as.factor(ifelse(DEG$padj<0.5&abs(DEG$log2FoldChange)>logFC_cutoff,
ifelse(DEG$log2FoldChange>logFC_cutoff,"UP","DOWN"),
"NOT"))
this_title <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',]))
DEG<-na.omit(DEG)
library(ggplot2)
ggplot(data=DEG,ggplot2::aes(x=log2FoldChange,
y=-log10(pvalue),
color=change))+
geom_point(alpha=0.8,size=3)+
labs(x="log2 fold change")+ ylab("-log10 FDR")+
ggtitle(this_title)+ggplot2::theme_bw(base_size = 20)+
theme(plot.title = element_text(size=15,hjust=0.5))+
scale_color_manual(values=c('#a121f0','#bebebe','#ffad21')) -> p1
p1
# Using generalized fold change (gFC): mean difference of matched quantiles
# of the log10 abundance distributions (robust alternative to plain FC).
# KO genes: load and convert to per-sample relative abundance
df_KO <- read.table(file = "data/ko_gene_data.txt", sep = "\t", header = TRUE, check.names = FALSE)
df_KO2 <- df_KO
rownames(df_KO2) <- df_KO$Gene_family
df_KO2 <- df_KO2[, -1]
# Column-wise relative abundance (each sample sums to 1)
df_KO3 <- apply(df_KO2, 2, function(x) x/sum(x))
# Drop the first two rows (presumably unmapped/ungrouped categories — confirm)
df_KO4 <- df_KO3[c(-1, -2), ]
feat.all <- df_KO4
# Read the sample grouping BEFORE using it to subset the feature table.
# The original filtered on `design` first and only re-read it afterwards,
# silently relying on a stale `design` left over from an earlier section.
design <- read.table(file = "data/group_KO2.txt", sep = "\t", header = TRUE, row.names = 1)
design$Sample_ID <- rownames(design)
feat.all <- feat.all[, colnames(feat.all) %in% rownames(design)]
feat.all <- feat.all[rowSums(feat.all) != 0, ]
# Keep only the differential features (DMO list)
DMO <- read.table(file = "data/DMO.txt", sep = "\t", header = TRUE, row.names = 1)
feat.all2 <- feat.all[rownames(feat.all) %in% rownames(DMO), ]
meta <- design
stopifnot(all(meta$Sample_ID %in% colnames(feat.all)))
# Calculate generalized fold change per feature
library(dplyr)
result_list <- list()
for (f in row.names(feat.all2)) {
  # Abundances of feature f in each group
  x <- feat.all2[f, meta %>% filter(group == 'Patients') %>% pull(Sample_ID)]
  y <- feat.all2[f, meta %>% filter(group == 'Healthy') %>% pull(Sample_ID)]
  # gFC = mean difference of the 10%-90% quantiles on log10 scale
  q.p <- quantile(log10(x + 1e-8), probs = seq(.1, .9, .05))
  q.n <- quantile(log10(y + 1e-8), probs = seq(.1, .9, .05))
  result_list[[f]] <- sum(q.p - q.n)/length(q.p)
}
# One row per feature, single column gFC (same shape as the original
# do.call(rbind, lapply(result_list, data.frame)) construction)
result_df2 <- data.frame(gFC = unlist(result_list))
# Attach gFC to the result table `res` defined in the earlier edgeR/DESeq2
# section. NOTE(review): assumes res rows are in the same order as the
# feat.all2 rows — confirm before trusting gFC/padj pairings.
res2 <- res
rownames(res2) <- res2$row
res2$gFC <- result_df2$gFC
res2$log2gFC <- log2(res2$gFC + 1)
# Volcano plot (gFC version)
DEG2 <- res2
#logFC_cutoff<-2
gFC_cutoff <- 0.5
# NOTE(review): padj < 0.5 looks like a typo for 0.05 — confirm the FDR cutoff.
DEG2$change2 <- as.factor(ifelse(DEG2$padj < 0.5 & abs(DEG2$gFC) > gFC_cutoff,
                                 ifelse(DEG2$gFC > gFC_cutoff, "UP", "DOWN"),
                                 "NOT"))
# Remove NA rows before counting so the title reflects only plotted points
# (the original counted first, letting NA rows inflate the totals).
DEG2 <- na.omit(DEG2)
this_title <- paste0('Cutoff for gFC is ', round(gFC_cutoff, 3),
                     '\nThe number of up gene is ', sum(DEG2$change2 == 'UP'),
                     '\nThe number of down gene is ', sum(DEG2$change2 == 'DOWN'))
library(ggplot2)
# y is -log10(padj), i.e. the FDR, so the "-log10 FDR" label is correct here
ggplot(data = DEG2, ggplot2::aes(x = gFC,
                                 y = -log10(padj),
                                 color = change2)) +
  geom_point(alpha = 0.8, size = 3) +
  labs(x = "gFC") + ylab("-log10 FDR") +
  ggtitle(this_title) + ggplot2::theme_bw(base_size = 20) +
  theme(plot.title = element_text(size = 15, hjust = 0.5)) +
  scale_color_manual(values = c('#a121f0', '#bebebe', '#ffad21')) -> p2
p2
#+xlim(NA,5)+ylim(NA,40) -> p2
write.csv(DEG2, "results/Difference_analysis/edgeR_DESeq2/KO_difference_DMO01.csv")
# Reference (参考): https://mp.weixin.qq.com/s/KZa-L9Fyv-5FIU9j5vPqmQ
# rm(list=ls())
# Load packages
library(DESeq2)
library(ggplot2)
library(tidyverse)
# Load data: OTU table (taxonomy in the leading columns) and sample metadata
OTU <- read.table("data/otutab3.txt", sep = "\t", row.names = 1, stringsAsFactors = FALSE, check.names = FALSE, header = TRUE)
metadata <- read.delim(file = "data/metadata2.txt", sep = '\t', stringsAsFactors = FALSE)
metadata$Group <- as.factor(metadata$Group)
# First 7 columns of OTU are the taxonomy ranks
tax <- cbind(rownames(OTU), OTU[, 1:7])
colnames(tax)[1] <- "OTU"
tax <- as.data.frame(tax)
# Select Phylum level data: Phylum column (2) plus the sample columns (8:25)
OTU_phylum <- OTU[, c(2, 8:25)]
# Sum counts per Phylum
OTU_phylum <- aggregate(.~ Phylum, data = OTU_phylum, sum)
rownames(OTU_phylum) = OTU_phylum$Phylum
OTU_phylum = OTU_phylum[, -1]# 1963 species
# Keep the phylum annotation in a dedicated column of tax
tax$phylum <- tax$Phylum
# Filter taxa whose relative abundance never reaches the threshold in any sample
otu_relative <- apply(OTU_phylum, 2, function(x){x/sum(x)})
threshold = 0.0005
idx <- rowSums(otu_relative > threshold) >= 1
otu <- as.data.frame(OTU_phylum[idx, ])
otu_relative <- as.data.frame(otu_relative[idx, ])
# DESeq2 differential abundance analysis: build the DESeqDataSet and fit
dds <- DESeqDataSetFromMatrix(countData = otu, colData = metadata, design = ~Group)
dds <- DESeq(dds)
# Extract the contrast. contrast = c(factor, numerator, denominator), so this
# compares control (OE) vs treatment (KO) — log2FC > 0 means higher in OE.
group='Group'
treatment = 'KO'
control ='OE'
res <- results(dds, contrast=c(group, control, treatment))
# Sort results by p-value
res = res[order(res$pvalue),]
res
#> log2 fold change (MLE): Group OE vs KO
#> Wald test p-value: Group OE vs KO
#> DataFrame with 15 rows and 6 columns
#> baseMean log2FoldChange lfcSE stat pvalue
#> <numeric> <numeric> <numeric> <numeric> <numeric>
#> Proteobacteria 21656.369 -1.035522 0.218080 -4.74835 2.05079e-06
#> Planctomycetes 87.803 1.434603 0.322774 4.44460 8.80552e-06
#> Actinobacteria 10561.924 -0.591404 0.169149 -3.49634 4.71685e-04
#> Verrucomicrobia 104.606 0.851992 0.255028 3.34078 8.35420e-04
#> Acidobacteria 160.148 0.606024 0.287930 2.10476 3.53119e-02
#> ... ... ... ... ... ...
#> Chlamydiae 15.95048 0.27160216 0.499050 0.5442380 0.586278
#> Firmicutes 685.46414 0.13970686 0.300424 0.4650319 0.641909
#> Spirochaetes 47.14644 -0.10156322 0.362492 -0.2801803 0.779339
#> Ignavibacteriae 5.38696 0.08623627 0.872991 0.0987826 0.921311
#> Nitrospirae 31.29710 -0.00496568 0.396255 -0.0125315 0.990002
#> padj
#> <numeric>
#> Proteobacteria 3.07619e-05
#> Planctomycetes 6.60414e-05
#> Actinobacteria 2.35843e-03
#> Verrucomicrobia 3.13283e-03
#> Acidobacteria 1.05936e-01
#> ... ...
#> Chlamydiae 0.799470
#> Firmicutes 0.802386
#> Spirochaetes 0.899238
#> Ignavibacteriae 0.987119
#> Nitrospirae 0.990002
summary(res)
#> out of 15 with nonzero total read count
#> adjusted p-value < 0.1
#> LFC > 0 (up) : 2, 13%
#> LFC < 0 (down) : 2, 13%
#> outliers [1] : 0, 0%
#> low counts [2] : 0, 0%
#> (mean count < 5)
#> [1] see 'cooksCutoff' argument of ?results
#> [2] see 'independentFiltering' argument of ?results
# Save full results table
res <- data.frame(res, stringsAsFactors = FALSE, check.names = FALSE)
write.table(res, 'results/Difference_analysis/edgeR_DESeq2/control_treat.DESeq2.txt', col.names = NA, sep = '\t', quote = FALSE)
# Screening of differential OTUs: sort by ascending padj, then descending
# log2FC. method = "radix" is the documented requirement for a vector-valued
# `decreasing`; make it explicit instead of relying on "auto" picking radix.
res1 <- res[order(res$padj, res$log2FoldChange, decreasing = c(FALSE, TRUE), method = "radix"), ]
# log2FC >= 1  & padj < 0.01 -> 'up'   (significantly up-regulated OTUs)
# log2FC <= -1 & padj < 0.01 -> 'down' (significantly down-regulated OTUs)
# everything else            -> 'none' (non-differential OTUs)
res1[which(res1$log2FoldChange >= 1 & res1$padj < 0.01),'sig'] <- 'up'
res1[which(res1$log2FoldChange <= -1 & res1$padj < 0.01),'sig'] <- 'down'
# Strict < 1 here: the original `<= 1` clobbered the 'up'/'down' label of
# OTUs sitting exactly on the |log2FC| == 1 boundary.
res1[which(abs(res1$log2FoldChange) < 1 | res1$padj >= 0.01),'sig'] <- 'none'
write.table(res1, file = 'results/Difference_analysis/edgeR_DESeq2/control_treat.txt', sep = '\t', col.names = NA, quote = FALSE)
# Output the selected differentially abundant OTUs
res1_select <- subset(res1, sig %in% c('up', 'down'))
write.table(res1_select, file = 'results/Difference_analysis/edgeR_DESeq2/control_treat.DESeq2.select.txt', sep = '\t', col.names = NA, quote = FALSE)
# Separate output according to up and down
res1_up <- subset(res1, sig == 'up')
res1_down <- subset(res1, sig == 'down')
write.table(res1_up, file = 'results/Difference_analysis/edgeR_DESeq2/control_treat.DESeq2.up.txt', sep = '\t', col.names = NA, quote = FALSE)
write.table(res1_down, file = 'results/Difference_analysis/edgeR_DESeq2/control_treat.DESeq2.down.txt', sep = '\t', col.names = NA, quote = FALSE)
# Count how many OTUs survive FDR-corrected P < 0.05
table(res1$padj < 0.05)
#> FALSE TRUE
#> 11 4
# Extract differential OTUs (padj < 0.01 and |log2FC| > 1)
diff_OTU_deseq2 <- subset(res1, padj < 0.01 & abs(log2FoldChange) > 1)
dim(diff_OTU_deseq2)
#> [1] 2 7
head(diff_OTU_deseq2)
#> baseMean log2FoldChange lfcSE stat pvalue
#> Proteobacteria 21656.369 -1.035522 0.2180802 -4.748354 2.050794e-06
#> Planctomycetes 87.803 1.434603 0.3227742 4.444601 8.805515e-06
#> padj sig
#> Proteobacteria 3.076190e-05 down
#> Planctomycetes 6.604136e-05 up
# paste0(): the original paste() inserted spaces into the output file name
# ("DEOTU_ OE _vs_ KO .csv")
write.csv(diff_OTU_deseq2, file = paste0("results/Difference_analysis/edgeR_DESeq2/DEOTU_", control, "_vs_", treatment, ".csv"))
# Calculate the mean relative abundance of each taxon per group
abundance <- aggregate(t(otu_relative), by = list(metadata$Group), FUN = mean)
abundance <- as.data.frame(t(abundance))
colnames(abundance) <- abundance[1, ]
abundance <- abundance[-1, ]
abundance <- as.data.frame(lapply(abundance, as.numeric))
# Overall mean abundance across the two groups (rowMeans replaces the
# equivalent apply(cbind(...), 1, mean))
res1$abundance <- rowMeans(cbind(abundance$KO, abundance$OE))
# Merge data. NOTE(review): this self-merge duplicates every column as
# .x/.y pairs; downstream code only uses the .x copies — confirm intent.
data <- merge(as.data.frame(res1), res1, by = "row.names", sort = FALSE, all = F)
# Filter relative abundance below 0.01%
data <- data[(data$abundance.x > 0.0001), ]
rownames(data) <- as.character(data$Row.names)
tax$OTU <- as.character(tax$OTU)
# Look up the phylum annotation of each row in the tax table
indexes <- match(data$Row.names, tax$OTU)
data$phylum <- tax$phylum[indexes]
# NOTE(review): this overwrites the lookup above. At phylum level the row
# names already ARE phylum names, so the match result is discarded — confirm
# whether the tax lookup is needed at all.
data$phylum <- data$Row.names
# Negative NATURAL log of the p-value (axis labels below use ln accordingly)
data$neglogp = -log(data$pvalue.x)
data <- as.data.frame(cbind(data$Row.names, data$log2FoldChange.x, data$pvalue.x, data$phylum, data$abundance.x, data$neglogp))
colnames(data) <- c("otu", "log2FoldChange", "pvalue", "phylum", "Abundance", "neglogp")
# cbind() above coerced every column to character; restore numeric types.
# log2FoldChange included — the original left it as character, so the later
# "< 0" test only worked by lexicographic accident ("-" sorts before "0").
data <- transform(data, Abundance = as.numeric(Abundance), neglogp = as.numeric(neglogp), pvalue = as.numeric(pvalue), log2FoldChange = as.numeric(log2FoldChange))
# Mark differential OTU type
data$level = as.factor(ifelse(data$pvalue >= 0.05, "nosig", ifelse(data$pvalue < 0.05 & data$log2FoldChange < 0, "enriched", "depleted")))
# Save results (paste0: the original paste() put spaces inside the file name)
write.csv(data, file = paste0("results/Difference_analysis/edgeR_DESeq2/OTU", control, "_vs_", treatment, ".csv"))
# Manhattan plot
label = unique(data$phylum)
label = label[!(label %in% "Low Abundance")] # Delete low abundance
# Set factor orders for plotting
data$phylum = factor(data$phylum, levels = c(label, "Low Abundance"))
data$level = factor(data$level, levels = c("enriched", "depleted", "nosig"))
# Plot
#data[data$neglogp>30,]$neglogp = 30
Title = paste("Differential OTU in ", control, " vs ", treatment)
# OTU as X-axis. y is -ln(P): neglogp was computed with the natural log, and
# the reference line is -log(0.05); the original "-log10(P)" label was wrong.
p <- ggplot(data, ggplot2::aes(x = otu, y = neglogp, color = phylum, shape = level, size = Abundance)) +
  geom_hline(yintercept = -log(0.05), linetype = 2, color = "lightgrey") +
  geom_point(alpha = .6, position = position_jitter(0.5), stroke = 2) +
  scale_shape_manual(values = c(17, 25, 20)) +
  scale_size(breaks = c(5, 10, 15)) +
  labs(x = "OTU", y = "-ln(P)", title = Title) +
  ggplot2::theme_classic() +
  theme(axis.ticks.x = element_blank(), axis.text.x = element_blank(),
        legend.position = "top",
        panel.grid = element_blank())
p
# Set color
library(RColorBrewer)
brewer.pal.info
#> maxcolors category colorblind
#> BrBG 11 div TRUE
#> PiYG 11 div TRUE
#> PRGn 11 div TRUE
#> PuOr 11 div TRUE
#> RdBu 11 div TRUE
#> RdGy 11 div FALSE
#> RdYlBu 11 div TRUE
#> RdYlGn 11 div FALSE
#> Spectral 11 div FALSE
#> Accent 8 qual FALSE
#> Dark2 8 qual TRUE
#> Paired 12 qual TRUE
#> Pastel1 9 qual FALSE
#> Pastel2 8 qual FALSE
#> Set1 9 qual FALSE
#> Set2 8 qual TRUE
#> Set3 12 qual FALSE
#> Blues 9 seq TRUE
#> BuGn 9 seq TRUE
#> BuPu 9 seq TRUE
#> GnBu 9 seq TRUE
#> Greens 9 seq TRUE
#> Greys 9 seq TRUE
#> Oranges 9 seq TRUE
#> OrRd 9 seq TRUE
#> PuBu 9 seq TRUE
#> PuBuGn 9 seq TRUE
#> PuRd 9 seq TRUE
#> Purples 9 seq TRUE
#> RdPu 9 seq TRUE
#> Reds 9 seq TRUE
#> YlGn 9 seq TRUE
#> YlGnBu 9 seq TRUE
#> YlOrBr 9 seq TRUE
#> YlOrRd 9 seq TRUE
display.brewer.all(type="qual")
# Dark2 has a maximum of 8 colours; requesting 15 (as the original did) only
# triggers a warning and still returns 8, so ask for exactly 8.
brewer.pal(8, 'Dark2')
#> [1] "#1B9E77" "#D95F02" "#7570B3" "#E7298A" "#66A61E" "#E6AB02" "#A6761D"
#> [8] "#666666"
# Set color: 7 Dark2 colours plus 8 manual colours (15 total, one per phylum)
p <- p + scale_color_manual(values = c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D",
                                       "#5ebcc2", "#1b868c", "#46a9cb", "#5791c9", "#7a76b7", "#945893", "#9c3d62", "#882100"))
p
# Save plot
ggplot2::ggsave("results/Difference_analysis/edgeR_DESeq2/man_otu.pdf", p, width = 9, height = 4)
# Phylum level data as X-axis
# y is -ln(P): neglogp was computed with the natural log and the reference
# line is -log(0.05); the original "-log10(P)" label mislabeled the axis.
p <- ggplot(data, ggplot2::aes(x = phylum, y = neglogp, color = phylum, shape = level, size = Abundance)) +
  geom_hline(yintercept = -log(0.05), linetype = 2, color = "lightgrey") +
  geom_point(alpha = .6, position = position_jitter(0.5), stroke = 2) +
  scale_shape_manual(values = c(17, 25, 20)) +
  scale_size(breaks = c(5, 10, 20)) +
  labs(x = NULL, y = "-ln(P)", title = Title) +
  ggplot2::theme_classic() +
  theme(legend.position = "top",
        panel.grid = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5)
  )
p
library(RColorBrewer)
brewer.pal.info
#> maxcolors category colorblind
#> BrBG 11 div TRUE
#> PiYG 11 div TRUE
#> PRGn 11 div TRUE
#> PuOr 11 div TRUE
#> RdBu 11 div TRUE
#> RdGy 11 div FALSE
#> RdYlBu 11 div TRUE
#> RdYlGn 11 div FALSE
#> Spectral 11 div FALSE
#> Accent 8 qual FALSE
#> Dark2 8 qual TRUE
#> Paired 12 qual TRUE
#> Pastel1 9 qual FALSE
#> Pastel2 8 qual FALSE
#> Set1 9 qual FALSE
#> Set2 8 qual TRUE
#> Set3 12 qual FALSE
#> Blues 9 seq TRUE
#> BuGn 9 seq TRUE
#> BuPu 9 seq TRUE
#> GnBu 9 seq TRUE
#> Greens 9 seq TRUE
#> Greys 9 seq TRUE
#> Oranges 9 seq TRUE
#> OrRd 9 seq TRUE
#> PuBu 9 seq TRUE
#> PuBuGn 9 seq TRUE
#> PuRd 9 seq TRUE
#> Purples 9 seq TRUE
#> RdPu 9 seq TRUE
#> Reds 9 seq TRUE
#> YlGn 9 seq TRUE
#> YlGnBu 9 seq TRUE
#> YlOrBr 9 seq TRUE
#> YlOrRd 9 seq TRUE
display.brewer.all(type="qual")
brewer.pal(7, 'Dark2')
#> [1] "#1B9E77" "#D95F02" "#7570B3" "#E7298A" "#66A61E" "#E6AB02" "#A6761D"
# Set color: Dark2 colours plus manual colours (15 total, one per phylum)
p <- p + scale_color_manual(values = c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D",
                                       "#5ebcc2", "#1b868c", "#46a9cb", "#5791c9", "#7a76b7", "#945893", "#9c3d62", "#882100"))
p
ggplot2::ggsave("results/Difference_analysis/edgeR_DESeq2/man_otu2.pdf", p, width = 9, height = 6)
## Load data: KEGG level-2 relative abundances plus the sample grouping
data <- read.table("data/KEGG_L2.txt", header = TRUE, row.names = 1, sep = "\t")
group <- read.table("data/group_stamp.txt", header = FALSE, sep = "\t")
library(tidyverse)
# Convert proportions to percentages; keep pathways with mean abundance > 1%
data <- data * 100
data <- data %>% filter(apply(data, 1, mean) > 1)
# Transpose to samples-in-rows and attach the group label
data <- t(data)
data1 <- data.frame(data, group$V2)
colnames(data1) <- c(colnames(data), "Group")
data1$Group <- as.factor(data1$Group)
## Welch t-test per pathway, Bonferroni-corrected
diff <- data1 %>%
  select_if(is.numeric) %>%
  map_df(~ broom::tidy(t.test(. ~ Group, data = data1)), .id = 'var')
diff$p.value <- p.adjust(diff$p.value, "bonferroni")
diff <- diff %>% filter(p.value < 0.05)
## Wilcoxon rank-sum test per pathway, Bonferroni-corrected
library(tidyverse)
diff1 <- data1 %>%
  select_if(is.numeric) %>%
  map_df(~ broom::tidy(wilcox.test(. ~ Group, data = data1)), .id = 'var')
diff1$p.value <- p.adjust(diff1$p.value, "bonferroni")
# Filter the Wilcoxon table ITSELF. The original line filtered `diff` (the
# t-test results) into diff1, discarding the Wilcoxon p-values entirely.
diff1 <- diff1 %>% filter(p.value < 0.05)
## Drawing data construction
## Left bar chart: per-pathway, per-group mean abundance (long format)
abun.bar <- data1[,c(diff$var,"Group")] %>%
gather(variable,value,-Group) %>%
group_by(variable,Group) %>%
dplyr::summarise(Mean = mean(value))
## Scatter plot on the right: effect size (estimate) with 95% CI, labelled
## with the group whose mean is larger (positive estimate -> first level)
diff.mean <- diff[,c("var","estimate","conf.low","conf.high","p.value")]
diff.mean$Group <- c(ifelse(diff.mean$estimate >0,levels(data1$Group)[1],
levels(data1$Group)[2]))
diff.mean <- diff.mean[order(diff.mean$estimate,decreasing = TRUE),]
## Left bar chart
library(ggplot2)
#cbbPalette <- c("#E69F00", "#56B4E9")
cbbPalette <- c("#5791c9", "#5ebcc2")
# Order the bars to mirror the effect-size ranking (reversed for coord_flip)
abun.bar$variable <- factor(abun.bar$variable,levels = rev(diff.mean$var))
# NOTE(review): diff.mean$var is still a character vector here (it only
# becomes a factor further below), so levels() returns NULL and this
# scale_x_discrete(limits = NULL) is a no-op — confirm the intended order.
p1 <- ggplot(abun.bar,ggplot2::aes(variable,Mean,fill = Group)) +
scale_x_discrete(limits = levels(diff.mean$var)) +
coord_flip() +
xlab("") +
ylab("Mean proportion (%)") +
theme(panel.background = element_rect(fill = 'transparent'),
panel.grid = element_blank(),
axis.ticks.length = unit(0.4,"lines"),
axis.ticks = element_line(color='black'),
axis.line = element_line(colour = "black"),
axis.title.x=element_text(colour='black', size=12,face = "bold"),
axis.text=element_text(colour='black',size=10,face = "bold"),
legend.title=element_blank(),
legend.text=element_text(size=12,face = "bold",colour = "black",
margin = margin(r = 20)),
legend.position = c(-1,-0.1),
legend.direction = "horizontal",
legend.key.width = unit(0.8,"cm"),
legend.key.height = unit(0.5,"cm"))
# Alternate white/grey background stripes behind the bars (one per row)
for (i in 1:(nrow(diff.mean) - 1))
p1 <- p1 + ggplot2::annotate('rect', xmin = i+0.5, xmax = i+1.5, ymin = -Inf, ymax = Inf,
fill = ifelse(i %% 2 == 0, 'white', 'gray95'))
# Draw the bars on top of the stripes
p1 <- p1 +
geom_bar(stat = "identity",position = "dodge",width = 0.7,colour = "black") +
scale_fill_manual(values=cbbPalette)
## Scatter plot on the right: difference in mean proportions with 95% CI
# Now make var a factor with the same ordering as the bar chart
diff.mean$var <- factor(diff.mean$var,levels = levels(abun.bar$variable))
# Format p-values for display (3 significant digits, as text)
diff.mean$p.value <- signif(diff.mean$p.value,3)
diff.mean$p.value <- as.character(diff.mean$p.value)
p2 <- ggplot(diff.mean,ggplot2::aes(var,estimate,fill = Group)) +
theme(panel.background = element_rect(fill = 'transparent'),
panel.grid = element_blank(),
axis.ticks.length = unit(0.4,"lines"),
axis.ticks = element_line(color='black'),
axis.line = element_line(colour = "black"),
axis.title.x=element_text(colour='black', size=12,face = "bold"),
axis.text=element_text(colour='black',size=10,face = "bold"),
axis.text.y = element_blank(),
legend.position = "none",
axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
plot.title = element_text(size = 15,face = "bold",colour = "black",hjust = 0.5)) +
scale_x_discrete(limits = levels(diff.mean$var)) +
coord_flip() +
xlab("") +
ylab("Difference in mean proportions (%)") +
labs(title="95% confidence intervals")
# Same alternating background stripes as the bar chart, keeping rows aligned
for (i in 1:(nrow(diff.mean) - 1))
p2 <- p2 + ggplot2::annotate('rect', xmin = i+0.5, xmax = i+1.5, ymin = -Inf, ymax = Inf,
fill = ifelse(i %% 2 == 0, 'white', 'gray95'))
# Error bars (95% CI) with the point estimate on top
p2 <- p2 +
geom_errorbar(ggplot2::aes(ymin = conf.low, ymax = conf.high),
position = position_dodge(0.8), width = 0.5, size = 0.5) +
geom_point(shape = 21,size = 3) +
scale_fill_manual(values=cbbPalette) +
geom_hline(ggplot2::aes(yintercept = 0), linetype = 'dashed', color = 'black')
# Third panel: corrected p-values as a text column, plus a rotated header
p3 <- ggplot(diff.mean,ggplot2::aes(var,estimate,fill = Group)) +
geom_text(ggplot2::aes(y = 0,x = var),label = diff.mean$p.value,
hjust = 0,fontface = "bold",inherit.aes = FALSE,size = 3) +
geom_text(ggplot2::aes(x = nrow(diff.mean)/2 +0.5,y = 0.85),label = "P-value (corrected)",
srt = 90,fontface = "bold",size = 5) +
coord_flip() +
ylim(c(0,1)) +
theme(panel.background = element_blank(),
panel.grid = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_blank())
## Patchwork: bars | CI scatter | p-value column, 4:6:2 width ratio
library(patchwork)
p <- p1 + p2 + p3 + plot_layout(widths = c(4,6,2))
## Save plots
ggplot2::ggsave("results/Difference_analysis/STAMP/stamp01.pdf",p,width = 10,height = 4)
# Heatmap and Bubble diagram
library(dplyr)
library(ggalluvial)
library(tidyverse)
library(tidyr)
library(reshape2)
library(ggpubr)
# Sum counts per Phylum
df3_p <- read.table(file = "data/sum_p.txt", sep = "\t", header = TRUE, check.names = FALSE)
design <- read.table(file = "data/group.txt", sep = "\t", header = TRUE, row.names = 1)
data_p <- aggregate(.~ Phylum, data = df3_p, sum)
rownames(data_p) = data_p$Phylum
data_p = data_p[, -1]# 17
#write.csv(data_p, "results/779samples_phylum.csv")
# Wilcoxon test: align samples between design and the abundance table
idx = rownames(design) %in% colnames(data_p)
metadata = design[idx, , drop = F]
data_p = data_p[, rownames(metadata)]
# Relative abundance (%) per sample. Spell out na.rm = TRUE — the original
# `na = T` only worked via partial argument matching.
norm = t(t(data_p)/colSums(data_p, na.rm = TRUE) * 100)
# Keep phyla with mean relative abundance > 0.01%
idx = rowMeans(norm) > 0.01
norm = norm[idx, ]
colSums(norm)
#> Healthy01 Healthy02 Healthy03 Healthy04 Healthy05 Healthy06 Healthy07
#> 99.94874 99.97143 100.00000 99.99840 100.00000 100.00000 100.00000
#> Healthy08 Healthy09 Healthy10 Patients01 Patients02 Patients03 Patients04
#> 99.97784 100.00000 100.00000 99.99816 100.00000 100.00000 99.98066
#> Patients05 Patients06 Patients07 Patients08 Patients09 Patients10
#> 99.98343 100.00000 99.96991 99.99124 100.00000 99.98941
data_p = data_p[idx, ]
# Pairwise comparison setup
compare_pair = "Healthy-Patients"
group_list = strsplit(compare_pair, "-")[[1]]
metadata$group = metadata$Group
idx = metadata$group %in% group_list
sub_metadata = metadata[idx, , drop = F]
sub_dat = as.matrix(data_p[, rownames(sub_metadata)])
method = "wilcox"
# Split the normalized abundances by group
idx = sub_metadata$group %in% group_list[1]
GroupA = norm[, rownames(sub_metadata[idx, , drop = F])]
idx = sub_metadata$group %in% group_list[2]
GroupB = norm[, rownames(sub_metadata[idx, , drop = F])]
# Per-phylum statistics: logFC, logCPM-like score and Wilcoxon p-value
nrDAF = data.frame(list = rownames(norm), row.names = rownames(norm))
for (i in seq_len(nrow(nrDAF))) {
  # Fold change of group means (pseudo-count avoids division by zero)
  FC = (mean(GroupA[i, ]) + 1e-07)/(mean(GroupB[i, ]) + 1e-07)
  nrDAF[i, 2] = log2(FC)
  # log2 of the maximum abundance, scaled — a CPM-like magnitude score
  nrDAF[i, 3] = log2(max(c(GroupA[i, ], GroupB[i, ])) * 10000)
  # Wilcoxon rank-sum test (tie warnings suppressed deliberately)
  nrDAF[i, 4] = suppressWarnings(wilcox.test(as.numeric(GroupA[i, ]),
                                             as.numeric(GroupB[i, ]))$p.value)
}
nrDAF = nrDAF[, -1]
colnames(nrDAF) = c("logFC", "logCPM", "PValue")
# FDR correction (n defaults to length(p); the original passed it redundantly)
nrDAF$FDR = p.adjust(nrDAF$PValue, method = "fdr")
nrDAF$Phylum <- rownames(nrDAF)
# Join per-sample abundances with the statistics
sub_dat2 <- as.data.frame(sub_dat)
sub_dat2$Phylum <- rownames(sub_dat2)
dt <- left_join(sub_dat2, nrDAF, by = "Phylum")
dt <- as.data.frame(dt)
rownames(dt) <- dt$Phylum
# Keep the 20 sample columns plus FDR (25) and logFC (22) for the bubble plot
dt2 <- dt[, c(1:20, 25, 22)]
# Select per-sample abundance columns for the heatmap
df <- dt[,1:20]
head(df)
#> Healthy01 Healthy02 Healthy03 Healthy04 Healthy05
#> Actinobacteria 0.94162 0.09249 0.39559 1.75714 0.32597
#> Bacteroidetes 63.76353 79.16039 10.42674 49.23169 60.61995
#> Candidatus_Saccharibacteria 0.00000 0.00000 0.00000 0.00000 0.00000
#> Euryarchaeota 0.00000 0.26306 0.00000 0.00000 0.01295
#> Firmicutes 28.16631 14.50066 74.95015 38.95455 35.54828
#> Fusobacteria 0.00000 0.10843 2.36739 0.31074 0.00000
#> Healthy06 Healthy07 Healthy08 Healthy09 Healthy10
#> Actinobacteria 0.00187 0.72349 0.28244 0.30560 0.03585
#> Bacteroidetes 47.94422 17.45493 68.45333 72.51388 35.58806
#> Candidatus_Saccharibacteria 0.00000 0.00209 0.00000 0.00000 0.00000
#> Euryarchaeota 0.00000 0.00000 0.08505 0.00000 0.00000
#> Firmicutes 46.17143 75.18337 25.88913 21.80918 46.70028
#> Fusobacteria 1.21316 0.00000 0.03197 2.80900 0.00000
#> Patients01 Patients02 Patients03 Patients04
#> Actinobacteria 1.30915 0.99638 9.03602 0.59828
#> Bacteroidetes 46.64941 29.23569 7.39858 65.51137
#> Candidatus_Saccharibacteria 0.00000 0.00790 0.00770 0.00000
#> Euryarchaeota 0.00000 0.00000 0.00000 0.00937
#> Firmicutes 47.22529 43.99165 80.61333 28.36682
#> Fusobacteria 0.00000 0.24494 0.00000 0.00000
#> Patients05 Patients06 Patients07 Patients08
#> Actinobacteria 0.72214 11.46197 0.64551 5.04221
#> Bacteroidetes 36.02456 24.39057 58.35594 20.48283
#> Candidatus_Saccharibacteria 0.00000 0.01242 0.00000 0.20987
#> Euryarchaeota 0.00000 0.01093 0.02543 0.00000
#> Firmicutes 51.26345 51.40997 30.85718 64.31857
#> Fusobacteria 0.00000 0.00000 0.35127 0.00000
#> Patients09 Patients10
#> Actinobacteria 2.30729 3.56827
#> Bacteroidetes 3.65587 3.28257
#> Candidatus_Saccharibacteria 0.19536 0.00000
#> Euryarchaeota 0.00000 0.00000
#> Firmicutes 32.14359 69.28669
#> Fusobacteria 0.00000 0.00000
# Per-sample proportions
df = apply(df, 2, function(x) x/sum(x))
# log10-transformation (pseudo-count so zeros stay finite)
df = log10(df + 1e-05)
# Row-wise z-score standardization. apply(..., 1, ...) returns the result
# transposed, hence the t() afterwards to restore taxa-in-rows.
df = apply(df, 1, function(x){
  return((x-mean(x))/sd(x))
})
df = t(df)
df = as.data.frame(df)
library(pheatmap)
library(ggplot2)
# Diverging palette for the z-scores
mycol <- colorRampPalette(c("#0da9ce", "white", "#e74a32"))(100)
# Heatmap without clustering (row/column order preserved)
p1 <- pheatmap(
  df,
  scale = 'none',
  cluster_rows = F,
  cluster_cols = F,
  show_colnames = F,
  color = mycol
)
# Column annotation: first 10 samples Healthy, last 10 Patients
group <- data.frame(type = c(rep("Healthy",10), rep("Patients",10)))
rownames(group) <- colnames(df)
group_colors <- list(type = c(Healthy = "#698e31", Patients = "#9cbe3f"))
head(group)
#> type
#> Healthy01 Healthy
#> Healthy02 Healthy
#> Healthy03 Healthy
#> Healthy04 Healthy
#> Healthy05 Healthy
#> Healthy06 Healthy
group_colors
#> $type
#> Healthy Patients
#> "#698e31" "#9cbe3f"
# Annotated version with ggheatmap (ggplot-based, so it composes with aplot)
library("ggheatmap")
p2 <- ggheatmap(df, cluster_rows = F, cluster_cols = F, scale = "none",
                color = colorRampPalette(c("#2fa1dd", "white", "#f87669"))(100),
                annotation_cols = group,
                annotation_color = group_colors,
                text_show_cols = NULL)
p2
# Difference bubble chart drawing
dt2$phylum <- rownames(dt2) # add phylum labels
dt2$x <- c(" ") # constant x-coordinate: all bubbles form a single column
# Reverse order so the bubbles align with the heatmap rows top-to-bottom
dt2$phylum <- factor(dt2$phylum, levels = rev(unique(dt2$phylum)))
# Set theme.
# NOTE(review): this shadows the global `mytheme` defined at the top of the
# script; rename one of them if both are needed later.
mytheme <- ggplot2::theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(size = 12, angle = -45),
        axis.text.y = element_text(size = 10),
        legend.text = element_text(size = 10),
        legend.text.align = 0.5,
        legend.title = element_text(size = 12, hjust = 0))
# Bubble size = logFC, fill = -log10(FDR)
p3 <- ggplot(dt2, ggplot2::aes(x = x, y = phylum)) +
  geom_point(ggplot2::aes(size = logFC,
                          fill = -log10(FDR)),
             color = "black",
             shape = 21,
             stroke = 0.8) +
  scale_size(range = c(3, 10)) +
  scale_fill_gradient(low = "#f5c6c6", high = "#cf0000") +
  mytheme +
  theme(axis.text.y = element_blank(),
        plot.background = element_blank(),
        text = element_text(size = 14))
p3
library(aplot)
# Attach the bubble column to the right of the heatmap
p4 <- p2 %>% insert_right(p3, width = 0.25)
p4
pdf("results/Difference_analysis/heatmap_bubble/heatmap_bubble_plot01.pdf", height = 7, width = 7)
# print() is required for the plot to be written when this script is
# source()d — top-level auto-printing does not happen inside source().
print(p4)
dev.off()
#> png
#>   2
# Comparison of bubble heat map combined with bar chart and box plot
# Load required packages
library(ggplot2)
library(dplyr)
library(scales)
library(RColorBrewer)
library(ggpubr)
library(grid)
library(paletteer)
library(cowplot)
library(magrittr)
library(stringr)
library(egg)
# Load data
dat_gene_num <- read.csv("data/fig5a_1.csv", header = TRUE)
re_gene_com <- read.csv("data/fig5a_2.csv", header = TRUE)
re_gsea <- read.csv("data/fig5a_3.csv", header = TRUE)
# Process data: fix species/category orderings so all panels stay aligned.
# mutate() is a dplyr verb — the original `ggpubr::mutate(...)` namespacing
# is wrong/fragile, so call it through dplyr directly.
dat_gene_num <- dat_gene_num %>%
  dplyr::mutate(bug_name = factor(bug_name, levels = unique(re_gene_com$species_update)))
re_gsea <- re_gsea %>%
  dplyr::mutate(
    go_id_description = factor(go_id_description, levels = unique(go_id_description)),
    species_update = factor(species_update, levels = rev(unique(species_update))),
    category = factor(category, levels = unique(category)),
    # significance marks: "*" for 0.05 <= padj < 0.1, "#" for padj < 0.05
    padj_annot1 = case_when(padj >= 0.05 & padj < 0.1 ~ "*", TRUE ~ ""),
    padj_annot2 = case_when(padj < 0.05 ~ "#", TRUE ~ "")
  )
# Define color palettes
breaklist <- seq(-1, 1, by = 0.001)
red_blue <- rev(brewer.pal(n = 11, name = "RdBu"))
col_red_blue <- colorRampPalette(red_blue)(length(breaklist))
# One named colour per functional category (last two are greys)
col_panel <- c(paletteer::paletteer_d("ggsci::default_igv")[1:15], "grey70", "grey90")
names(col_panel) <- c(
  "Amino acid metabolism", "Bacterial structural components", "Cell motility",
  "DNA replication & transcription", "Fatty acid metabolism", "Genetic Rearrangement",
  "Glucose metabolism", "Phage and HGT", "Proteolysis", "Quorum sensing",
  "DNA methylation", "Signal transduction", "Stress response",
  "Virulence and antibiotic resistance", "Damaged DNA repair", "Other", "Unknown"
)
# Plot gene number bar plot: horizontal bars of log10(gene count) per
# species, dodged by gene_ind; legend suppressed (shown elsewhere)
p_gene_num <- ggplot(dat_gene_num, ggplot2::aes(x = log10(gene_num), y = bug_name, fill = gene_ind)) +
geom_bar(stat = "identity", position = "dodge", width = 0.7) +
scale_fill_manual(values = c("#1F4690", "#B73E3E")) +
labs(x = "T2D-associated genes") +
ggplot2::theme_bw() +
theme(
axis.title.y = element_blank(),
# Italic species names on the shared y-axis (only this leftmost panel shows them)
axis.text.y = element_text(size = 14, face = "italic", color = "black"),
axis.text.x = element_text(size = 8),
plot.margin = unit(c(5.5, 1, 5.5, 0), "points"),
legend.position = "none"
)
# Plot boxplot: |t statistic| per species, colored by direction of effect;
# y-axis text hidden because the panel shares the species axis with p_gene_num
p_box <- ggplot(re_gene_com, ggplot2::aes(y = species_update, x = abs(t_statistic), color = Direction)) +
geom_boxplot(position = position_dodge(0.7), alpha = 0.3, width = 0.7) +
scale_color_manual(values = c("#1F4690", "#B73E3E")) +
labs(x = "Absolute t statistic") +
ggplot2::theme_bw() +
theme(
axis.title.y = element_blank(),
axis.text.x = element_text(size = 8),
axis.text.y = element_blank(),
plot.margin = unit(c(5.5, 1, 5.5, 1), "points"),
legend.position = "none"
)
# Plot bubble plot: GO term x species, bubble color = enrichment score,
# size = gene-count category; significance marks overlaid as text
# ("*" for 0.05 <= padj < 0.1, "#" for padj < 0.05 — see padj_annot1/2)
p_bub <- ggplot(re_gsea, ggplot2::aes(x = go_id_description, y = species_update, color = ES, size = genenum_cat)) +
geom_point(alpha = 0.7) +
geom_text(ggplot2::aes(label = padj_annot1), color = "black", size = 7, nudge_y = -0.2) +
geom_text(ggplot2::aes(label = padj_annot2), color = "black", size = 4, nudge_y = 0) +
scale_size(range = c(8, 13), breaks = c(1, 2, 3), name = "Num. of Genes") +
labs(x = "Species", y = "GO term") +
scale_color_gradientn(colours = col_red_blue, name = "Enrichment score") +
ggplot2::theme_bw() +
# Axis text hidden: species labels come from p_gene_num, GO labels from p_cat
theme(
axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.title.y = element_blank(),
axis.ticks.x = element_blank(),
axis.title = element_text(size = 13),
plot.margin = unit(c(5.5, 1, 5.5, 1), "points"),
legend.position = "none"
)
# Legend for bubble plot: same aesthetics as p_bub but with legends enabled,
# built only so its legend can be extracted and saved separately
p_bub_lgd <- ggplot(re_gsea, ggplot2::aes(x = go_id_description, y = species_update, color = ES, size = genenum_cat)) +
  geom_point(alpha = 0.8) +
  scale_size(range = c(8, 13), breaks = c(1, 2, 3), name = "Num. of Genes") +
  geom_text(ggplot2::aes(label = padj_annot1), color = "black", size = 7, nudge_y = -0.2) +
  geom_text(ggplot2::aes(label = padj_annot2), color = "black", size = 4, nudge_y = 0) +
  labs(x = "Species", y = "GO term") +
  scale_color_gradientn(colours = col_red_blue, name = "Enrichment score") +
  theme(
    axis.title.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(size = 13, angle = 45, face = "italic", hjust = 1, vjust = 1),
    axis.title = element_text(size = 13),
    axis.title.x = element_blank()
  )
# Extract just the legend grob and write it to its own PDF
p_bub_legend <- ggpubr::get_legend(p_bub_lgd)
pdf("results/Difference_analysis/heatmap_bar_box/legend_bubble_plot.pdf", width = 8, height = 5)
grid::grid.draw(p_bub_legend)
dev.off()
#> png
#>   2
# Plot category annotation: a one-row tile strip under the bubble panel,
# one tile per GO term, filled by functional category
breaks_go <- as.character(unique(re_gsea$go_id_description))
# Truncate long GO descriptions to "first 30 chars ... last 15 chars"
labels_go <- ifelse(str_length(breaks_go) < 35, breaks_go, paste0(str_sub(breaks_go, 1, 30), "...", str_sub(breaks_go, -15, -1)))
p_cat <- ggplot(re_gsea, ggplot2::aes(y = 1, x = go_id_description)) +
geom_tile(ggplot2::aes(fill = category), width = 1) +
labs(fill = "Category") +
scale_fill_manual(values = col_panel) +
coord_cartesian(expand = FALSE) +
scale_x_discrete(breaks = breaks_go, labels = labels_go) +
ggplot2::theme_bw() +
theme(
panel.grid = element_blank(),
axis.text.x = element_text(size = 11, angle = 30, vjust = 1, hjust = 1, color = "black"),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title = element_blank(),
plot.margin = unit(c(0, 5.5, 5.5, 5.5), "points"),
legend.position = "none"
)
# Legend for category annotation: a throwaway plot with legends enabled so
# the category legend can be extracted and saved on its own
p_cat_lgd <- ggplot(re_gsea, ggplot2::aes(x = 1, y = go_id_description)) +
  geom_tile(ggplot2::aes(fill = category), width = 1) +
  labs(fill = "Category") +
  scale_fill_manual(values = col_panel) +
  guides(fill = guide_legend(reverse = FALSE)) +
  ggplot2::theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.text.y = element_text(size = 12, color = "black"),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title = element_text(size = 13),
    axis.title.x = element_blank()
  )
# Extract the legend grob and write it to its own PDF
cat_legend <- cowplot::get_legend(p_cat_lgd)
pdf("results/Difference_analysis/heatmap_bar_box/go_category_legend.pdf", width = 5, height = 5)
grid::grid.draw(cat_legend)
dev.off()
#> png
#>   2
# Combine all plots into one figure: bar | box | bubble on top, with the
# category strip under the bubble panel (blank spacers under bar/box)
blank <- ggplot() + ggplot2::theme_void()
pdf("results/Difference_analysis/heatmap_bar_box/heatmap_bar_box01.pdf", width = 15.5, height = 8)
egg::ggarrange(
  p_gene_num, p_box, p_bub,
  blank, blank, p_cat,
  nrow = 2, ncol = 3,
  heights = c(5, 0.1),
  widths = c(1, 1, 6)
)
dev.off()
#> png
#>   2
#devtools::install_github("BioSenior/ggvolcano", force = TRUE)
library(ggVolcano)
# Load data: the DESeq2 results produced by the earlier analysis
# (原始文件读取: the data used here is the result of the DESeq2 run above)
data_vol <- read.table("data/data_volcano.txt", header = TRUE, sep = "\t", row.names = 1)
data_vol = as.data.frame(data_vol)
# add_regulate() appends a regulate column based on the log2FC/FDR cutoffs
data <- add_regulate(data_vol, log2FC_name = "log2FC",
                     fdr_name = "padj", log2FC = 1, fdr = 0.05)
#data <- data[, -6]
# Replace the computed labels with the precomputed association column
data$regulate <- data_vol$association
colnames(data)[3] <- "FoldChange"
data$padj2 <- -log10(as.numeric(data$padj))
#logFC = 0.5
#P.Value = 0.05
p_volcano1 <- ggplot(data = data, ggplot2::aes(x = FoldChange, y = padj2)) +
  geom_point(alpha = 0.4, size = 3.0, ggplot2::aes(color = regulate)) +
  ylab("-log10(Pvalue)") +
  scale_color_manual(values = c("#74add1", "#a60026", "grey")) +
  #scale_color_manual(values = c("#4177a4","#b33e5b", "grey")) +
  #scale_color_manual(values = c("#81CBAB","#854888", "grey")) +
  geom_vline(xintercept = c(-0.5, 0.5), lty = 4, col = "black", lwd = 0.4) +
  geom_hline(yintercept = -log10(0.05), lty = 4, col = "black", lwd = 0.4) +
  labs(x = bquote(Log[2]~italic(FC)), y = bquote(atop(-Log[10]~italic(FDR)))) +
  ggplot2::theme_bw()
# add labels
library(dplyr)
# Select top 5 enriched species by -log10(padj). Use bare column names in
# filter()/arrange() — the original data$col form bypasses tidy evaluation
# and is error-prone.
up_data1 <- filter(data, regulate == "Enriched")
up_data2 <- dplyr::arrange(up_data1, dplyr::desc(padj2))
up_data_5 <- up_data2[1:5, ]
# Select top 5 depleted species (variable name kept for compatibility)
down_data1 <- filter(data, regulate == "Depleted")
down_data2 <- dplyr::arrange(down_data1, desc(padj2))
down_data_25 <- down_data2[1:5, ]
# Add labels with geom_text_repel(). Map label = row from each layer's own
# data — the original up_data_5$row / down_data_25$row inside aes() breaks
# silently if ggplot reorders or subsets the layer data.
library(ggrepel)
p_volcano2 <- p_volcano1 +
  geom_text_repel(data = up_data_5, ggplot2::aes(x = FoldChange,
                                                 y = padj2,
                                                 label = row), size = 3) +
  geom_text_repel(data = down_data_25, ggplot2::aes(x = FoldChange,
                                                    y = padj2,
                                                    label = row), size = 3) +
  theme(legend.position = c(0.84, 0.85), panel.grid = element_blank())
ggplot2::ggsave(paste("results//Difference_analysis/volcano_plot/Two_group_volcano_plot", ".pdf", sep=""),
                p_volcano2, width = 100 * 1.5, height = 80 * 1.5, unit = 'mm')
p_volcano2
# Load data
otutab2 = read.table("data/otutab2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
design <- read.delim(file = "data/metadata2.txt", sep = '\t', stringsAsFactors = FALSE)
rownames(design) <- design$SampleID
# Phylum level: keep the Phylum name column plus the count columns
otutab_p <- otutab2[, c(2, 8:25)]
# Sum counts within each phylum
otutab_p2 <- aggregate(.~ Phylum, data = otutab_p, sum)
rownames(otutab_p2) = otutab_p2$Phylum
otutab_p2 = otutab_p2[, -1]# 1963 species
# KO vs OE: first 12 samples
data_p01 <- otutab_p2[, 1:12]
design01 <- design[1:12, ]
# Wilcox test: align metadata and count columns
idx = rownames(design01) %in% colnames(data_p01)
metadata = design01[idx, , drop = F]
data_p01 = data_p01[, rownames(metadata)]
# Relative abundance (%) per sample. Spell out na.rm = TRUE -- the original
# `na = T` only worked through partial argument matching and the T alias.
norm = t(t(data_p01)/colSums(data_p01, na.rm = TRUE) * 100)
# Keep phyla with mean relative abundance > 0.01%
idx = rowMeans(norm) > 0.01
norm = norm[idx, ]
colSums(norm)
KO1 KO2 KO3 KO4 KO5 KO6 OE1 OE2
99.99695 99.99161 100.00000 100.00000 100.00000 100.00000 99.99062 99.99084
OE3 OE4 OE5 OE6
99.99707 100.00000 99.99696 99.98102
# Keep only the taxa that passed the mean-abundance filter above
data_p01 = data_p01[idx, ]
compare_pair = "KO-OE"
group_list = strsplit(compare_pair, "-")[[1]]
metadata$group = metadata$Group
idx = metadata$group %in% group_list
sub_metadata = metadata[idx, , drop = F]
sub_dat = as.matrix(data_p01[, rownames(sub_metadata)])
method = "wilcox"
# Split the normalized abundance matrix into the two comparison groups
idx = sub_metadata$group %in% group_list[1]
GroupA = norm[, rownames(sub_metadata[idx, , drop = F])]
idx = sub_metadata$group %in% group_list[2]
GroupB = norm[, rownames(sub_metadata[idx, , drop = F])]
# Per-phylum statistics: log2 fold change (with 1e-07 pseudo-count to avoid
# division by zero), a log2 "CPM"-like abundance, and a Wilcoxon p-value
nrDAF = data.frame(list = rownames(norm), row.names = rownames(norm))
for (i in 1:dim(nrDAF)[1]) {
FC = (mean(GroupA[i, ]) + 1e-07)/(mean(GroupB[i,
]) + 1e-07)
nrDAF[i, 2] = log2(FC)
nrDAF[i, 3] = log2(max(c(GroupA[i, ], GroupB[i, ])) *
10000)
# suppressWarnings: ties trigger "cannot compute exact p-value" warnings
nrDAF[i, 4] = suppressWarnings(wilcox.test(as.numeric(GroupA[i,
]), as.numeric(GroupB[i, ]))$p.value)
}
# Drop the placeholder column and name the computed statistics
nrDAF = nrDAF[, -1]
colnames(nrDAF) = c("logFC", "logCPM", "PValue")
nrDAF$FDR = p.adjust(nrDAF$PValue, method = "fdr", dim(nrDAF)[1])
nrDAF$Phylum <- rownames(nrDAF)
nrDAF$compared_group <- "KO vs OE"
# KO vs WT: KO samples plus WT samples
data_p02 <- otutab_p2[, c(1:6, 13:18)]
design02 <- design[c(1:6,13:18), ]
# Wilcox test: align metadata and count columns
idx = rownames(design02) %in% colnames(data_p02)
metadata = design02[idx, , drop = F]
data_p02 = data_p02[, rownames(metadata)]
# Relative abundance (%) per sample. Spell out na.rm = TRUE -- the original
# `na = T` only worked through partial argument matching and the T alias.
norm = t(t(data_p02)/colSums(data_p02, na.rm = TRUE) * 100)
# Keep phyla with mean relative abundance > 0.01%
idx = rowMeans(norm) > 0.01
norm = norm[idx, ]
colSums(norm)
KO1 KO2 KO3 KO4 KO5 KO6 WT1 WT2
99.99695 99.99161 100.00000 100.00000 100.00000 100.00000 99.99190 100.00000
WT3 WT4 WT5 WT6
99.99446 99.99175 99.99458 99.99181
# BUG FIX: this line previously re-subset data_p01 (the KO-vs-OE table,
# already filtered above) using an idx computed from the KO-vs-WT matrix.
# The KO-vs-WT table data_p02 is the one that must be filtered here.
data_p02 = data_p02[idx, ]
compare_pair = "KO-WT"
group_list = strsplit(compare_pair, "-")[[1]]
metadata$group = metadata$Group
idx = metadata$group %in% group_list
sub_metadata = metadata[idx, , drop = F]
sub_dat = as.matrix(data_p02[, rownames(sub_metadata)])
method = "wilcox"
# Split the normalized abundance matrix into the two comparison groups
idx = sub_metadata$group %in% group_list[1]
GroupA = norm[, rownames(sub_metadata[idx, , drop = F])]
idx = sub_metadata$group %in% group_list[2]
GroupB = norm[, rownames(sub_metadata[idx, , drop = F])]
# Per-phylum log2 FC (1e-07 pseudo-count), log2 abundance, Wilcoxon p-value
nrDAF2 = data.frame(list = rownames(norm), row.names = rownames(norm))
for (i in 1:dim(nrDAF2)[1]) {
FC = (mean(GroupA[i, ]) + 1e-07)/(mean(GroupB[i,
]) + 1e-07)
nrDAF2[i, 2] = log2(FC)
nrDAF2[i, 3] = log2(max(c(GroupA[i, ], GroupB[i, ])) *
10000)
# suppressWarnings: ties trigger exact-p-value warnings
nrDAF2[i, 4] = suppressWarnings(wilcox.test(as.numeric(GroupA[i,
]), as.numeric(GroupB[i, ]))$p.value)
}
# Drop the placeholder column and name the computed statistics
nrDAF2 = nrDAF2[, -1]
colnames(nrDAF2) = c("logFC", "logCPM", "PValue")
nrDAF2$FDR = p.adjust(nrDAF2$PValue, method = "fdr", dim(nrDAF2)[1])
nrDAF2$Phylum <- rownames(nrDAF2)
nrDAF2$compared_group <- "KO vs WT"
# OE vs WT: OE samples plus WT samples
data_p03 <- otutab_p2[, c(7:18)]
design03 <- design[c(7:18), ]
# Wilcox test: align metadata and count columns
idx = rownames(design03) %in% colnames(data_p03)
metadata = design03[idx, , drop = F]
data_p03 = data_p03[, rownames(metadata)]
# Relative abundance (%) per sample. Spell out na.rm = TRUE -- the original
# `na = T` only worked through partial argument matching and the T alias.
norm = t(t(data_p03)/colSums(data_p03, na.rm = TRUE) * 100)
# Keep phyla with mean relative abundance > 0.01%
idx = rowMeans(norm) > 0.01
norm = norm[idx, ]
colSums(norm)
OE1 OE2 OE3 OE4 OE5 OE6 WT1 WT2
99.99062 99.99084 99.99707 100.00000 99.99696 99.98102 99.99190 100.00000
WT3 WT4 WT5 WT6
99.99446 99.99175 99.99458 99.99181
# Keep only the taxa that passed the mean-abundance filter above
data_p03 = data_p03[idx, ]
compare_pair = "OE-WT"
group_list = strsplit(compare_pair, "-")[[1]]
metadata$group = metadata$Group
idx = metadata$group %in% group_list
sub_metadata = metadata[idx, , drop = F]
sub_dat = as.matrix(data_p03[, rownames(sub_metadata)])
method = "wilcox"
# Split the normalized abundance matrix into the two comparison groups
idx = sub_metadata$group %in% group_list[1]
GroupA = norm[, rownames(sub_metadata[idx, , drop = F])]
idx = sub_metadata$group %in% group_list[2]
GroupB = norm[, rownames(sub_metadata[idx, , drop = F])]
# Per-phylum statistics, computed column-wise instead of cell-by-cell:
# log2 fold change with a 1e-07 pseudo-count, a log2 "CPM"-like abundance
# from the largest observed value, and a Wilcoxon rank-sum p-value
# (warnings about ties suppressed, as before).
pseudo <- 1e-07
logFC <- log2((rowMeans(GroupA) + pseudo) / (rowMeans(GroupB) + pseudo))
logCPM <- log2(apply(cbind(GroupA, GroupB), 1, max) * 10000)
PValue <- vapply(seq_len(nrow(norm)), function(k) {
  suppressWarnings(
    wilcox.test(as.numeric(GroupA[k, ]), as.numeric(GroupB[k, ]))$p.value
  )
}, numeric(1))
nrDAF3 = data.frame(logFC = logFC, logCPM = logCPM, PValue = PValue,
                    row.names = rownames(norm))
nrDAF3$FDR = p.adjust(nrDAF3$PValue, method = "fdr", n = nrow(nrDAF3))
nrDAF3$Phylum <- rownames(nrDAF3)
nrDAF3$compared_group <- "OE vs WT"
# Merge nrDAF, nrDAF2 and nrDAF3 into one long table
dat <- rbind(nrDAF, nrDAF2, nrDAF3)
# Reshaping data: thresholds for the up/down significance call
log2Foldchange=0.28
adjp=0.1
# mutate() is a dplyr verb -- qualify it with its home package rather than
# the misleading ggpubr:: prefix.
dat.plot <- dat %>% dplyr::mutate(
  "significance"=case_when(FDR < adjp & logFC>= log2Foldchange ~ 'up',
                           FDR < adjp &logFC<= -log2Foldchange ~ 'down',
                           TRUE ~ 'insig'))
# Referring the levels of x axis
dat.plot$compared_group <- factor(dat.plot$compared_group,
                                  levels = c("KO vs OE",
                                             "KO vs WT",
                                             "OE vs WT"))
# Background data for geom_col: per comparison, the max positive and min
# negative logFC define the grey column heights; x.local is the x position.
top_marker=5
background.dat <- data.frame(
  dat.plot %>% group_by(compared_group) %>% filter(logFC>0) %>%
    dplyr::summarise("y.localup"=max(logFC)),
  dat.plot %>% group_by(compared_group) %>% filter(logFC<0) %>%
    dplyr::summarise("y.localdown"=min(logFC)),
  # seq_along() instead of the redundant seq(1:length(...))
  x.local=seq_along(unique(dat.plot$compared_group))
) %>% select(-compared_group.1)
names(background.dat)
[1] "compared_group" "y.localup" "y.localdown" "x.local"
# Map each comparison to its numeric x position and join it onto the table
x.number <- background.dat %>% select(compared_group,x.local)
dat.plot <- dat.plot%>% left_join(x.number,by = "compared_group")
names(dat.plot)
[1] "logFC" "logCPM" "PValue" "FDR"
[5] "Phylum" "compared_group" "significance" "x.local"
# selecting top-up and top-down proteins
dat.marked.up <- dat.plot %>% filter(significance=="up") %>%
group_by(compared_group) %>% plyr::arrange(-logFC) %>%
top_n(top_marker,abs(logFC))
dat.marked.down <- dat.plot %>% filter(significance=="down") %>%
group_by(compared_group) %>% plyr::arrange(logFC) %>%
top_n(top_marker,abs(logFC))
dat.marked <- dat.marked.up %>% bind_rows(dat.marked.down)
# referring group information data
dat.infor <- background.dat %>%
ggpubr::mutate("y.infor"=rep(0,length(compared_group)))
names(dat.infor)
[1] "compared_group" "y.localup" "y.localdown" "x.local"
[5] "y.infor"
# Plot: multi-group volcano with grey background columns, jittered points,
# group tiles at y = 0, and repelled labels for the top taxa.
max_overlaps=10
vol.plot <- ggplot()+
  geom_col(background.dat,mapping=ggplot2::aes(x.local,y.localup),
           fill="grey50",alpha=0.2,width=0.9,just = 0.5)+
  geom_col(background.dat,mapping=ggplot2::aes(x.local,y.localdown),
           fill="grey50",alpha=0.2,width=0.9,just = 0.5)+
  geom_jitter(dat.plot,mapping=ggplot2::aes(x.local,logFC,
                                            color=significance,
                                            fill=significance),
              size=1.5,width = 0.4,alpha= 0.4)+
  scale_color_manual(values = c("#82677e","#eaebea","#59829e"))+
  # scale_color_manual(values = c("#5390b5","#46a9cb","#eaebea","#ffe8d2","#c97aaa","#d56e5e"))+
  # NOTE(review): the literal `color =` below overrides the aes() mapping and
  # feeds factor levels as colour values -- confirm this renders as intended.
  geom_tile(dat.infor,mapping=ggplot2::aes(x.local,y.infor,fill=compared_group,
                                           color = compared_group),
            height=log2Foldchange*1.5,
            #color = color.pals[1:length(unique(dat.plot$compared_group))],
            color = unique(dat.plot$compared_group),
            #fill = color.pals[1:length(unique(dat.plot$compared_group))],
            #fill = unique(dat.plot$compared_group),
            alpha = 0.6,
            width=0.9)+guides(size=guide_legend(title="Count"))+
  labs(x=NULL,y="log2 Fold change")+
  geom_text(dat.infor,mapping=ggplot2::aes(x.local,y.infor,label=compared_group))+
  ggrepel::geom_label_repel(dat.marked.up,mapping=ggplot2::aes(x.local,logFC,label=Phylum,color=significance),
                            force = 2,size=2,
                            max.overlaps = max_overlaps,
                            seed = 233,
                            min.segment.length = 0,
                            force_pull = 2,
                            box.padding = 0.1,
                            segment.linetype = 3,
                            segment.color = 'black',
                            segment.alpha = 0.5,
                            direction = "x",
                            hjust = 0.5)+
  ggrepel::geom_label_repel(dat.marked.down,mapping=ggplot2::aes(x.local,logFC,label=Phylum,color=significance),
                            force = 2,size=2,
                            max.overlaps = max_overlaps,
                            seed = 233,
                            min.segment.length = 0,
                            force_pull = 2,
                            box.padding = 0.1,
                            segment.linetype = 3,
                            segment.color = 'black',
                            segment.alpha = 0.5,
                            direction = "x",
                            hjust = 0.5)+
  # FIX: the caption previously hard-coded "FDR<0.05" although the actual
  # threshold used above is adjp (= 0.1); report the real threshold.
  ggplot2::annotate("text", x=1.5, y=max(background.dat$y.localup)+2,
                    label=paste0("|log2FC|>=",log2Foldchange," & FDR<",adjp))+
  ggplot2::theme_classic()+
  theme(
    legend.spacing.x=unit(0.1,'cm'),
    legend.key.width=unit(0.5,'cm'),
    legend.key.height=unit(0.5,'cm'),
    legend.background=element_blank(),
    legend.box="horizontal",
    legend.position = c(0.15,0.72),legend.justification = c(1,0)
  )
vol.plot
# Saving plot as PDF. Pass the plot object explicitly instead of relying on
# the "last displayed plot" followed by print() and a stray dev.off() --
# the old dev.off() closed whatever graphics device happened to be current.
ggplot2::ggsave('results/Difference_analysis/volcano_plot/Multi_group_vol01.pdf',
                vol.plot, width = 9, height = 5)
pdf
3
## Another way to draw
# Data preparation: combine the three pairwise comparison tables
dat <- rbind(nrDAF, nrDAF2, nrDAF3)
# reshaping data: thresholds for the significance call
log2Foldchange=0.28
adjp=0.1
# mutate() is a dplyr verb -- qualify it with its home package rather than
# the misleading ggpubr:: prefix.
dat.plot <- dat %>% dplyr::mutate(
  "significance"=case_when(FDR < adjp & logFC>= log2Foldchange ~ 'up',
                           FDR < adjp &logFC<= -log2Foldchange ~ 'down',
                           TRUE ~ 'insig'))
# Referring the levels of x axis
dat.plot$compared_group <- factor(dat.plot$compared_group,
                                  levels = c("KO vs OE",
                                             "KO vs WT",
                                             "OE vs WT"))
# rbind() de-duplicated the row names, so they serve as unique IDs here
dat.plot$ID = row.names(dat.plot)
datv = dat.plot
datv$group <- datv$compared_group
# Select the top 5 significant features (by |logFC|) within each group;
# rows are returned in group order, matching unique(data$group).
tm.g <- function(data){
  picked <- lapply(unique(data$group), function(g) {
    data %>%
      filter(group == g, significance != "insig") %>%
      distinct(ID, .keep_all = TRUE) %>%
      top_n(5, abs(logFC))
  })
  do.call(rbind, picked)
}
# Top significant taxa per comparison, used later for labelling
top <- tm.g(datv)
# First draw the background column, and determine it according to the max and min values of the log2FC data.
head(datv)
logFC logCPM PValue FDR
Acidobacteria -1.3502738 13.384400 0.015151515 0.04329004
Actinobacteria -0.1621734 18.628042 0.132034632 0.16504329
Armatimonadetes -1.6364291 9.019031 0.002164502 0.01623377
Bacteroidetes -0.5878856 16.422834 0.064935065 0.08854782
Candidatus_Saccharibacteria 0.0291015 10.086527 1.000000000 1.00000000
Chlamydiae -1.0089896 9.802508 0.025974026 0.04329004
Phylum compared_group
Acidobacteria Acidobacteria KO vs OE
Actinobacteria Actinobacteria KO vs OE
Armatimonadetes Armatimonadetes KO vs OE
Bacteroidetes Bacteroidetes KO vs OE
Candidatus_Saccharibacteria Candidatus_Saccharibacteria KO vs OE
Chlamydiae Chlamydiae KO vs OE
significance ID group
Acidobacteria down Acidobacteria KO vs OE
Actinobacteria insig Actinobacteria KO vs OE
Armatimonadetes down Armatimonadetes KO vs OE
Bacteroidetes down Bacteroidetes KO vs OE
Candidatus_Saccharibacteria insig Candidatus_Saccharibacteria KO vs OE
Chlamydiae down Chlamydiae KO vs OE
# Per-group logFC extremes define the heights of the background columns
tem = datv %>% group_by(group) %>% dplyr::summarise(max = max(logFC),min = min(logFC)) %>% as.data.frame()
col1<-data.frame(x=tem$group,
y=tem$max)
col2<-data.frame(x=tem$group,
y=tem$min)
# Draw the background column (grey bars up to the max and down to the min)
p1 <- ggplot()+
geom_col(data = col1,
mapping = ggplot2::aes(x = x,y = y),
fill = "#dcdcdc",alpha = 0.6)+
geom_col(data = col2,
mapping = ggplot2::aes(x = x,y = y),
fill = "#dcdcdc",alpha = 0.6)
p1
# Overlay the scatter volcano map on the background column
head(datv)
logFC logCPM PValue FDR
Acidobacteria -1.3502738 13.384400 0.015151515 0.04329004
Actinobacteria -0.1621734 18.628042 0.132034632 0.16504329
Armatimonadetes -1.6364291 9.019031 0.002164502 0.01623377
Bacteroidetes -0.5878856 16.422834 0.064935065 0.08854782
Candidatus_Saccharibacteria 0.0291015 10.086527 1.000000000 1.00000000
Chlamydiae -1.0089896 9.802508 0.025974026 0.04329004
Phylum compared_group
Acidobacteria Acidobacteria KO vs OE
Actinobacteria Actinobacteria KO vs OE
Armatimonadetes Armatimonadetes KO vs OE
Bacteroidetes Bacteroidetes KO vs OE
Candidatus_Saccharibacteria Candidatus_Saccharibacteria KO vs OE
Chlamydiae Chlamydiae KO vs OE
significance ID group
Acidobacteria down Acidobacteria KO vs OE
Actinobacteria insig Actinobacteria KO vs OE
Armatimonadetes down Armatimonadetes KO vs OE
Bacteroidetes down Bacteroidetes KO vs OE
Candidatus_Saccharibacteria insig Candidatus_Saccharibacteria KO vs OE
Chlamydiae down Chlamydiae KO vs OE
# Overlay the scatter volcano on the background columns
p2 <- ggplot()+
geom_col(data = col1,
mapping = ggplot2::aes(x = x,y = y),
fill = "#dcdcdc",alpha = 0.6)+
geom_col(data = col2,
mapping = ggplot2::aes(x = x,y = y),
fill = "#dcdcdc",alpha = 0.6)+
# One jittered point per taxon per comparison, colored by significance
geom_jitter(data = datv,
ggplot2::aes(x =group , y = logFC, color =significance ),
size = 1,
width =0.4)+
scale_color_manual(name=NULL,
values = c("#4393C3","grey40","#FC4E2A"))+
labs(x="",y="log2(FoldChange)")
p2
# Add grouped color block labels to the X-axis
dfcol<-data.frame(x=tem$group,
y=0,
label=tem$group)
# Add grouped color block labels
dfcol$group <- tem$group
# Load color packages
library(RColorBrewer)
library(MetBrewer)
# BiocManager::install("MetBrewer")
# Set color: one tile colour per comparison group
tile_color <- met.brewer("Thomas",length(tem$group))
# Inlaying color blocks in the image. `fill` is a plain vector (one colour
# per tile, in group order), so it stays outside aes().
p3 <- p2 + geom_tile(data = dfcol,
ggplot2::aes(x=x,y=y),
height=0.5,
color = "black",
fill = tile_color,
alpha = 0.6,
show.legend = F)+
geom_text(data=dfcol,
ggplot2::aes(x=x,y=y,label=group),
size =3.5,
color ="white") + ggplot2::theme_classic()
p3
library(ggrepel)
# Label the top taxa selected by tm.g(), drawing arrows to their points
p4<-p3+geom_text_repel(
data=top,
ggplot2::aes(x=group,y=logFC,label=ID),
force = 1.2,
arrow = arrow(length = unit(0.008, "npc"),
type = "open", ends = "last"))
p4
# Remove background
p5 <- p4+
ggplot2::theme_minimal()+
theme(
axis.title = element_text(size = 18,
color = "black",
face = "bold"),
axis.line.y = element_line(color = "black",
size = 1.2),
axis.line.x = element_blank(),
axis.text.x = element_blank(),
panel.grid = element_blank(),
legend.position = "top",
legend.direction = "vertical",
legend.justification = c(1,0),
legend.text = element_text(size = 12)
)
p5
# return(list(p5,p3,datv,top))
# NOTE(review): p3 (without the minimal-theme tweaks above) is what gets
# saved below, not p5 -- confirm this is intentional.
p = p3
p
filename = paste("results/Difference_analysis/volcano_plot/Mui.group.volcano.pdf",sep = "")
ggplot2::ggsave(filename,p,width = 12,height = 6,limitsize = FALSE)
# The first way: microeco software package
# rm(list=ls())
# Load packages
library(tidyverse)
library(microeco)
library(magrittr)
library(ggplot2)
# Load data: counts, sample groups and taxonomy share row/column names
otu <- read.csv("data/otu.csv", row.names = 1)
group <- read.csv("data/group.csv", row.names = 1)
tax <- read.csv("data/tax.csv", row.names = 1)
# Create objects that the microeco package can recognize
dataset <- microtable$new(sample_table = group,
otu_table = otu,
tax_table = tax)
dataset
microtable-class object:
sample_table have 80 rows and 2 columns
otu_table have 13262 rows and 80 columns
tax_table have 13296 rows and 7 columns
# LEfSe analysis: differential taxa across the groups in `Group`,
# alpha 0.05 with FDR-adjusted p-values, no subgroup stratification
lefse <- trans_diff$new(dataset = dataset,
method = "lefse",
group = "Group",
alpha = 0.05,
p_adjust_method = "fdr",
lefse_subgroup = NULL)
microtable-class object:
sample_table have 80 rows and 2 columns
otu_table have 13262 rows and 80 columns
tax_table have 13262 rows and 7 columns
# Check results
head(lefse$res_diff)
Comparison
k__Bacteria|p__Proteobacteria MG - NG - SG - YG
k__Bacteria|p__Acidobacteria MG - NG - SG - YG
k__Bacteria|p__Acidobacteria|c__Acidobacteria MG - NG - SG - YG
k__Bacteria|p__Bacteroidetes MG - NG - SG - YG
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria MG - NG - SG - YG
k__Bacteria|p__Chloroflexi MG - NG - SG - YG
Taxa
k__Bacteria|p__Proteobacteria k__Bacteria|p__Proteobacteria
k__Bacteria|p__Acidobacteria k__Bacteria|p__Acidobacteria
k__Bacteria|p__Acidobacteria|c__Acidobacteria k__Bacteria|p__Acidobacteria|c__Acidobacteria
k__Bacteria|p__Bacteroidetes k__Bacteria|p__Bacteroidetes
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria
k__Bacteria|p__Chloroflexi k__Bacteria|p__Chloroflexi
Method Group LDA
k__Bacteria|p__Proteobacteria LEfSe MG 4.933121
k__Bacteria|p__Acidobacteria LEfSe YG 4.799503
k__Bacteria|p__Acidobacteria|c__Acidobacteria LEfSe YG 4.761262
k__Bacteria|p__Bacteroidetes LEfSe NG 4.757968
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria LEfSe MG 4.689141
k__Bacteria|p__Chloroflexi LEfSe YG 4.506116
P.unadj P.adj
k__Bacteria|p__Proteobacteria 9.808412e-09 1.253563e-07
k__Bacteria|p__Acidobacteria 2.181291e-07 1.660025e-06
k__Bacteria|p__Acidobacteria|c__Acidobacteria 1.309977e-06 7.461175e-06
k__Bacteria|p__Bacteroidetes 1.010392e-06 5.995157e-06
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 1.217453e-08 1.543416e-07
k__Bacteria|p__Chloroflexi 6.575168e-03 1.280813e-02
Significance
k__Bacteria|p__Proteobacteria ***
k__Bacteria|p__Acidobacteria ***
k__Bacteria|p__Acidobacteria|c__Acidobacteria ***
k__Bacteria|p__Bacteroidetes ***
k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria ***
k__Bacteria|p__Chloroflexi *
#write.csv(lefse$res_diff,"res_diff.csv",quote = FALSE,row.names = FALSE)
# Draw a histogram of the differential features of the top 30 taxa with the highest LDA (log10)
lefse_bar <- lefse$plot_diff_bar(use_number = 1:30,
width = 0.8,
group_order = c("YG", "MG", "NG"))
lefse_bar
ggplot2::ggsave(paste("results/Biomarker_identification/LEfSe/microeco_lefse_bar02",".pdf", sep=""), lefse_bar, width=189 * 1.5, height=150 * 1.5, unit='mm')
# Cladogram: display the top 200 taxa and top 50 features
p_lefse = lefse$plot_diff_cladogram(filter_taxa = 0.0001,
use_taxa_num = 200,
use_feature_num = 50,
clade_label_level = 5,
group_order = c("YG", "MG", "NG"))
ggplot2::ggsave(paste("results/Biomarker_identification/LEfSe/microeco_cladorgam01",".pdf", sep=""), p_lefse, width=189 * 1.5, height=150 * 1.5, unit='mm')
# Change details: restrict clade labels to a hand-picked set of taxa
use_labels <- c("c__Deltaproteobacteria", "c__Actinobacteria", "o__Rhizobiales", "p__Proteobacteria", "p__Bacteroidetes",
"o__Micrococcales", "p__Acidobacteria", "p__Verrucomicrobia", "p__Firmicutes",
"p__Chloroflexi", "c__Acidobacteria", "c__Gammaproteobacteria", "c__Betaproteobacteria", "c__KD4-96",
"c__Bacilli", "o__Gemmatimonadales", "f__Gemmatimonadaceae", "o__Bacillales", "o__Rhodobacterales")
# then use parameter select_show_labels to show only those labels
p_lefse = lefse$plot_diff_cladogram(use_taxa_num = 200,
use_feature_num = 50,
select_show_labels = use_labels)
ggplot2::ggsave(paste("results/Biomarker_identification/LEfSe/microeco_cladorgam02",".pdf", sep=""), p_lefse, width=189 * 1.5, height=150 * 1.5, unit='mm')
# The second method: microbiomeMarker software package
# Load packages
library(tidyverse)
library(magrittr)
# otu counts; phyloseq's otu_table() below expects a matrix, not a data frame
otu <- read.csv("data/microbiomeMarker/otu.csv",header = TRUE,row.names = 1)
otu %<>% as.matrix()
otu %>% head()
CKA1 CKA2 CKA3 CKB1 CKB2 CKB3 CKC1 CKC2 CKC3 LGA1 LGA2 LGA3 LGB1 LGB2
ASV_1 14 14 17 13 10 19 10 4 12 14 12 13 15 15
ASV_2 12 18 9 0 0 5 13 15 13 10 9 0 7 0
ASV_3 5 10 9 12 6 8 6 0 7 7 9 14 0 9
ASV_4 27 7 0 2160 593 4277 9956 26076 16018 29 26 32 18 26
ASV_5 0 0 0 6 0 0 0 6 0 0 8 0 0 0
ASV_6 76 8 37 42 45 46 17 17 36 47 46 55 41 28
LGB3 LGC1 LGC2 LGC3 MGA1 MGA2 MGA3 MGB1 MGB2 MGB3 MGC1 MGC2 MGC3 HGA1
ASV_1 14 14 22 15 1902 2016 1818 857 1057 1043 5721 7434 6074 3988
ASV_2 14 18 19 18 3092 3163 2930 475 562 597 7135 8660 6688 59
ASV_3 13 0 8 14 1732 1771 744 1650 1702 1685 1387 2662 1587 3670
ASV_4 14 31 13 0 0 4 0 0 14 39 23 0 0 9
ASV_5 7 0 0 0 831 777 846 1457 1521 1410 1211 1417 1420 1418
ASV_6 40 31 28 14 797 858 828 1739 2246 2635 753 1408 654 1288
HGA2 HGA3 HGB1 HGB2 HGB3 HGC1 HGC2 HGC3
ASV_1 4654 3879 2739 2880 2882 4301 4583 4977
ASV_2 53 44 8876 9833 10042 183 183 202
ASV_3 1796 1736 645 661 1255 5343 3452 4157
ASV_4 0 0 12 37 14 0 0 6
ASV_5 1696 1467 487 526 471 1939 1885 2001
ASV_6 1571 1172 662 677 736 1515 1524 1976
# env and group info.
env <- read.csv("data/microbiomeMarker/env.csv",header = TRUE,row.names = 1)
# Build sample IDs plus a grazing+depth treatment code. Refer to the piped
# columns directly inside mutate() (not env$...), qualify mutate() with its
# home package dplyr, and use paste0() instead of paste(sep = "").
env %>%
  dplyr::mutate(
    sampleID = row.names(.),
    treats = paste0(grazing, depth)) %>%
  select(sampleID,treats,everything())-> env
env %>%
  head()
sampleID treats grazing depth TN TP TK Ammonia AP OM OC
CKA1 CKA1 CKA CK A 1.410 1.11 0.70 1.48 30.40 24.2 14.03779
CKA2 CKA2 CKA CK A 1.450 1.15 0.69 1.90 29.30 20.1 11.66248
CKA3 CKA3 CKA CK A 1.470 1.16 0.66 2.04 25.94 18.3 10.59294
CKB1 CKB1 CKB CK B 1.456 1.08 0.67 1.77 35.40 23.9 13.85225
CKB2 CKB2 CKB CK B 1.448 1.28 0.72 1.99 32.38 19.2 11.12560
CKB3 CKB3 CKB CK B 1.422 1.10 0.70 2.00 27.64 19.3 11.21479
# ASV taxonomy; tax_table() below also requires a matrix
tax <- read.csv("data/microbiomeMarker/taxonomy.csv",header = TRUE,row.names = 1)
tax %<>% as.matrix()
tax %>% head()
Kingdom Phylum Class
ASV_1 "d__Bacteria" "p__Proteobacteria" "c__Gammaproteobacteria"
ASV_2 "d__Bacteria" "p__Proteobacteria" "c__Gammaproteobacteria"
ASV_3 "d__Bacteria" "p__Actinobacteriota" "c__Acidimicrobiia"
ASV_4 "d__Bacteria" "p__Bacteroidota" "c__Bacteroidia"
ASV_5 "d__Bacteria" "p__Proteobacteria" "c__Alphaproteobacteria"
ASV_6 "d__Bacteria" "p__Cyanobacteria" "c__Cyanobacteriia"
Order Family
ASV_1 "o__Pseudomonadales" "f__Nitrincolaceae"
ASV_2 "o__Enterobacterales" "f__Alteromonadaceae"
ASV_3 "o__Actinomarinales" "f__Actinomarinaceae"
ASV_4 "o__Bacteroidales" "f__Marinifilaceae"
ASV_5 "o__Rhodobacterales" "f__Rhodobacteraceae"
ASV_6 "o__Synechococcales" "f__Cyanobiaceae"
Genus
ASV_1 "g__Marinobacterium"
ASV_2 "g__Glaciecola"
ASV_3 "g__Candidatus_Actinomarina"
ASV_4 "g__Marinifilaceae_unclassified"
ASV_5 "g__Roseobacter"
ASV_6 "g__Synechococcus_CC9902"
Species
ASV_1 "s__Marinobacterium_unclassified"
ASV_2 "s__Glaciecola_unclassified"
ASV_3 "s__Candidatus_Actinomarina_unclassified"
ASV_4 "s__Marinifilaceae_unclassified"
ASV_5 "s__uncultured_Roseobacter_sp."
ASV_6 "s__Synechococcus_CC9902_unclassified"
## Data preparation
library(phyloseq)
# Bundle counts, taxonomy and sample metadata into one phyloseq object
physeq <- phyloseq(
otu_table(otu,taxa_are_rows = TRUE),
tax_table(tax),
sample_data(env)
)
physeq
phyloseq-class experiment-level object
otu_table() OTU Table: [ 31182 taxa and 36 samples ]
sample_data() Sample Data: [ 36 samples by 11 sample variables ]
tax_table() Taxonomy Table: [ 31182 taxa by 7 taxonomic ranks ]
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("microbiomeMarker")
# Data standardization - rarefy to even sequencing depth
library(microbiomeMarker) # provides 7 normalization methods
##?normalize
# Fixed seed so the random subsampling in rarefaction is reproducible
set.seed(12345)
data <- microbiomeMarker::normalize(physeq, method = "rarefy")
data
phyloseq-class experiment-level object
otu_table() OTU Table: [ 25486 taxa and 36 samples ]
sample_data() Sample Data: [ 36 samples by 11 sample variables ]
tax_table() Taxonomy Table: [ 25486 taxa by 7 taxonomic ranks ]
# Data filtering to keep only the bacteria domain
data <- phyloseq::subset_taxa(
data,
Kingdom %in% c("d__Bacteria")
)
data
phyloseq-class experiment-level object
otu_table() OTU Table: [ 23865 taxa and 36 samples ]
sample_data() Sample Data: [ 36 samples by 11 sample variables ]
tax_table() Taxonomy Table: [ 23865 taxa by 7 taxonomic ranks ]
## MicrobiomeMarker for lefse analysis
# lefse analysis
# Lefse analysis - LDA threshold set to 4
args(run_lefse) # View function parameters
function (ps, group, subgroup = NULL, taxa_rank = "all", transform = c("identity",
"log10", "log10p"), norm = "CPM", norm_para = list(), kw_cutoff = 0.05,
lda_cutoff = 2, bootstrap_n = 30, bootstrap_fraction = 2/3,
wilcoxon_cutoff = 0.05, multigrp_strat = FALSE, strict = c("0",
"1", "2"), sample_min = 10, only_same_subgrp = FALSE,
curv = FALSE)
NULL
#?run_lefse
# Total count of distinct taxon names summed over all ranks; used below as
# the divisor for a Bonferroni-style cutoff
# NOTE(review): summing unique names across ranks as the test count is an
# unusual correction -- confirm this is the intended multiplicity.
adj <- tax_table(data) %>% apply(.,2,function(x) length(unique(x))) %>% sum
# This function does not provide a difference test p-value correction parameter, so you can correct it yourself.
lefse <- run_lefse(
data,
norm = "CPM",
group = "grazing",
multigrp_strat = TRUE,
wilcoxon_cutoff = 0.001,
kw_cutoff = 0.01/adj, # bonferroni
bootstrap_n = 50, # bootstrap times
lda_cutoff = 4
)
## Extract biomarker identification results
# Taxa without annotated names will be named according to the annotation results of the previous taxa.
lefse %>% marker_table() -> res.diff
# write.csv(res.diff,"lda4_diff.csv",quote = FALSE)
# res.diff = as.data.frame(res.diff)
# write.csv(res.diff,"results/lda4_diff.csv")
dim(res.diff) # number of biomarkers (55 in the recorded run)
[1] 55 5
res.diff %>% head()
feature
marker1 d__Bacteria|p__Firmicutes
marker2 d__Bacteria|p__Firmicutes|c__Clostridia
marker3 d__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Hyphomicrobiaceae|g__Hyphomicrobiaceae_unclassified
marker4 d__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Hyphomicrobiaceae|g__Hyphomicrobiaceae_unclassified|s__Hyphomicrobiaceae_unclassified
marker5 d__Bacteria|p__Proteobacteria|c__Alphaproteobacteria
marker6 d__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Alphaproteobacteria_unclassified
enrich_group ef_lda pvalue padj
marker1 CK 4.393701 9.966991e-07 9.966991e-07
marker2 CK 4.323842 1.004466e-06 1.004466e-06
marker3 CK 4.045791 1.569234e-06 1.569234e-06
marker4 CK 4.045791 1.569234e-06 1.569234e-06
marker5 HG 5.224315 1.143917e-06 1.143917e-06
marker6 HG 4.314859 1.302684e-06 1.302684e-06
## Visualization of lefse analysis results
# cladogram
cols <- RColorBrewer::brewer.pal(8, "Dark2")
# FIX: pdf() takes plain numeric sizes in INCHES; a grid::unit(16, "cm")
# object is silently coerced to the bare number 16 and read as 16 inches.
# Convert the intended 16 cm to inches explicitly.
pdf("results/Biomarker_identification/LEfSe/microbiomeMarker_cladogram_lda_marke_only.pdf",
    width = 16 / 2.54,
    height = 16 / 2.54,
    family="Times")
plot_cladogram(
  lefse,
  # The number of colors needs to be consistent with the number of enrich_group classification levels.
  color = cols[seq_along(res.diff$enrich_group %>% unique)],
  # FALSE displays all data in the graph, TRUE indicates that only data with significant differences are displayed.
  only_marker = TRUE,
  branch_size = 0.2,
  alpha = 0.2,
  node_size_scale = 1,
  node_size_offset = 1.1,
  # Add the largest taxonomic unit of the branch label, and the remaining taxonomic unit annotations are used as legends. The larger the value, the higher the taxonomic unit represented.
  clade_label_level = 7,
  clade_label_font_size = 2, # clade label size
  annotation_shape = 22,
  annotation_shape_size = 2,
  marker_legend_param = list(
    ncol = 2,
    direction = "horizontal"
  )
) +
  theme(
    plot.margin = margin(0, 0, 0, 0),
    legend.position = "bottom"
  )
dev.off()
png
2
## LDA effect size plot
# Bar plot
args(plot_ef_bar)
function (mm, label_level = 1, max_label_len = 60, markers = NULL)
NULL
# FIX: pdf() sizes are numeric inches; convert the intended 16 cm explicitly
# (unit(16, "cm") was being coerced to 16 inches).
pdf("results/Biomarker_identification/LEfSe/microbiomeMarker_ef_bar.pdf",
    width = 16 / 2.54,
    height = 16 / 2.54,
    family="Times")
# LDA effect-size bars; label width sized to the longest feature name
plot_ef_bar(lefse,max_label_len = max(str_length(res.diff$feature))) +
  scale_fill_manual(values = cols[1:3])
dev.off()
png
2
## Bubble plot
args(plot_ef_dot)
function (mm, label_level = 1, max_label_len = 60, markers = NULL)
NULL
# FIX: pdf() sizes are numeric inches; convert the intended 16 cm explicitly
# (unit(16, "cm") was being coerced to 16 inches).
pdf("results/Biomarker_identification/LEfSe/microbiomeMarker_ef_dot.pdf",
    width = 16 / 2.54,
    height = 16 / 2.54,
    family="Times")
plot_ef_dot(lefse,
            max_label_len = max(str_length(res.diff$feature)))+
  scale_color_manual(values = cols[1:3])
dev.off()
png
2
## Lollipop: effect-size dots plus segments anchored at LDA = 4
p1 <- plot_ef_dot(lefse,
                  max_label_len = max(str_length(res.diff$feature)))+
  scale_color_manual(values = cols[1:3])+
  geom_segment(ggplot2::aes(x= 4,xend = effect_size,y = feature, yend = feature))
#p1$data
# FIX: ggsave() expects numeric width/height plus a `units` argument; a grid
# unit object is coerced to its bare number and interpreted as inches.
ggplot2::ggsave("results/Biomarker_identification/LEfSe/microbiomeMarker_ef_Lollipop.pdf",
                p1,
                device = "pdf",
                width = 16,
                height = 16,
                units = "cm",
                family="Times")
## Taxonomic unit abundance difference map
args(plot_abundance)
function (mm, label_level = 1, max_label_len = 60, markers = NULL,
group)
NULL
# Abundance differences for the identified markers, split by grazing group
p2 <- plot_abundance(lefse,
                     group = "grazing",
                     max_label_len = max(str_length(res.diff$feature)))+
  scale_fill_manual(values = cols[1:4],breaks = c("CK","LG","MG","HG"))
## Change details
p2$data$grazing <- factor(p2$data$grazing, levels = c("CK","LG","MG","HG")) # set factor level
# Log-transform the abundances stored inside the ggplot object and relabel
p2$data$abd <- log10(p2$data$abd + 1)
p2 <- p2 + labs(x = "log10(Abundance +1)")
# FIX: ggsave() expects numeric sizes with an explicit `units`; grid unit
# objects are coerced to bare numbers (inches) otherwise.
ggplot2::ggsave("results/Biomarker_identification/LEfSe/microbiomeMarker_abundance.pdf",
                p2,
                device = "pdf",
                width = 16,
                height = 20,
                units = "cm",
                family="Times")
## The third method: online Lefse analysis combined with local drawing beautification
library(magrittr)
library(dplyr)
library(ggplot2)
library(ggpubr)
# metacyc data (data organization)
otutable <- read.table("data/metacyc_lefse.txt",header = TRUE,row.names = 1,sep = "\t")
otutable = data.frame(otutable,stringsAsFactors = F)
# Coerce every column to numeric (read.table may have left character columns)
otutable2 = as.data.frame(lapply(otutable, as.numeric))
otutable2$pathways = rownames(otutable)
# Move the pathway-name column to the front
# NOTE(review): hard-codes 20 sample columns (column 21 = pathways) -- confirm
otutable2 = otutable2[, c(21, 1:20)]
metadata <- read.table("data/group_lefse.txt", header=T)
# Drop the old Group column, then promote the (new) second column to `Group`
# NOTE(review): depends on the column layout of group_lefse.txt -- confirm
metadata$Group <- NULL
colnames(metadata)[2] <- 'Group'
rownames(metadata) <- metadata$sample
RA <- otutable2
rownames(RA) <- RA$pathways
RA1 = RA[,2:ncol(RA)]
# Transpose so samples are rows, merge in group labels, then transpose back
RA1=as.data.frame(t(RA1))
RA1$sample = rownames(RA1)
RA1 = merge(metadata,RA1,by='sample')
rownames(RA1) = RA1$sample
RA1 = RA1[,-1]
RA1=as.data.frame(t(RA1))
#write.table(RA1,file = 'results/Biomarker_identification/LEfSe/lefse_metacyc_pathways.txt',quote = FALSE,sep = "\t", row.names = T,col.names = T)
# Lefse online analysis
# lefse analysis was performed in website https://www.bioincloud.tech/task-meta
# load lefse results
lefse = read.table('data/lefse_selected.txt',header=T,sep='\t',stringsAsFactors = F)
lefse = as.data.frame(lefse)
# Keep only rows with an enriched group assigned and an LDA score >= 3
lefse = lefse[which(lefse$EnrichedGroups != ''),]
lefse = lefse[which(lefse$LDA>=3.0),]
# When EnrichGroups are 2, one of the LDA groups takes a negative value
# (so the two groups extend in opposite directions in the bar chart)
if(length(unique(lefse$EnrichedGroups))==2){
lefse[lefse$EnrichedGroups==unique(lefse$EnrichedGroups)[1],4] =
0 - lefse[lefse$EnrichedGroups==unique(lefse$EnrichedGroups)[1],4]
}
# Sort by LDA size when EnrichGroups are 2, and sort by size within each EnrichGroup when EnrichGroups are greater than 2
if(length(unique(lefse$EnrichedGroups))==2){
lefse=lefse[order(lefse$LDA,decreasing = T),]
} else {lefse=lefse[order(lefse$Region,lefse$Disease,lefse$LDA,decreasing = F),]}
lefse$KW_Pvalue = as.numeric(lefse$KW_Pvalue)
# NOTE(review): EnrichedGroups is releveled twice (Patients/Healthy first,
# then ordered Healthy/Patients) and Biomarkernames is factored twice; in
# each pair the second call wins, so the first looks redundant -- confirm.
lefse$EnrichedGroups = factor(lefse$EnrichedGroups,levels = c('Patients','Healthy'))
lefse$Biomarkernames = factor(lefse$Biomarkernames, levels = lefse$Biomarkernames)
lefse = lefse %>%
ggpubr::mutate(EnrichedGroups = ordered(EnrichedGroups,
levels=c("Healthy","Patients")))
lefse$Biomarkernames = factor(lefse$Biomarkernames,levels = as.character(lefse$Biomarkernames))
# Diverging LDA bar chart: bars colored by enriched group; biomarker names
# are drawn as text on the opposite side of zero from each bar
g_metacyc_lefse <- ggplot(lefse,ggplot2::aes(x = Biomarkernames,y = LDA, fill = EnrichedGroups)) +
scale_y_continuous(limits = c(-4.5,4),breaks=seq(-4, 4, 1))+
geom_bar(stat = 'identity',colour = 'black',width = 0.8,position = position_dodge(0.7))+
xlab('') + ylab('LDA SCORE (log 10)') + coord_flip() + bgcolor("white")+
ggplot2::theme_bw() + labs(fill = "Group")+
# Dashed guide lines at every integer LDA value
geom_hline(yintercept = c(-4, -3, -2, -1, 0, 1, 2, 3, 4),
linetype=2,
alpha=0.6,
color='black',
lwd=0.3)+
theme(legend.position = "bottom")+
#scale_fill_manual(values = c("#00C0D8","#FF6060"))+
#scale_color_manual(values = c("#00C0D8","#FF6060"))+
scale_fill_manual(values = c("#36aecc","#975896"))+
scale_color_manual(values = c("#36aecc","#975896"))+
theme(axis.text.y = element_blank(),axis.ticks = element_blank()
) +
theme(panel.border = element_blank(), panel.grid = element_blank()) +
# Name labels hug the zero line, placed opposite each bar's direction
geom_text(ggplot2::aes(y = ifelse(lefse$LDA >0,-0.1,0.1),label=Biomarkernames),fontface=1,size=4,hjust = ifelse(lefse$LDA>0,1,0))
ggplot2::ggsave(paste("results/Biomarker_identification/LEfSe/patients_healthy_metacyc_lefse_3.0",".pdf", sep=""), g_metacyc_lefse, width=149 * 1.5, height=170 * 1.5, unit='mm')
g_metacyc_lefse
# Biomarker screening using random forest models
# rm(list=ls())
# Load packages
library(reshape2)
library(ggplot2)
library(ggprism)
library(dplyr)
library(plyr)
library(caret)
library(randomForest)
#install.packages("PRROC")
library(PRROC)
library(ROCR)
library(pROC)
library(yardstick)
library(patchwork)
library(cols4all)
library(openxlsx)
library(tidyverse)
conflicts_prefer(ggplot2::theme_classic)
conflicts_prefer(base::setdiff)
conflicts_prefer(ggplot2::theme_bw)
# Loading settings and functions
source("function/randomforest.crossvalidation.R")
# Set theme
# Shared plot theme for the random-forest figures: classic look, sans-serif
# text, legend drawn inside the panel. Later theme() calls override earlier
# ones, so the final axis.text.x (hjust = 0.6) wins over the one above.
mytheme = theme_classic() +
theme(text = element_text(family = "sans", size = 10))+
theme(#legend.position="none",
legend.text = element_text(size=8),
legend.title = element_blank(),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text.y = element_text(size=10, colour="black", family = "sans", angle = 0),
axis.text.x = element_text(size=10, colour="black", family = "sans", angle = 0, hjust = 0),
axis.title= element_text(size=12, family = "sans"),
strip.text.x = element_text(size=10, angle = 0),
strip.text.y = element_text(size=10, angle = 0),
panel.border = element_rect(colour = "black"),
plot.title = element_text(size=10, angle = 0),
strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.5),
# c(0.85, 0.65) places the legend inside the plotting area (npc coordinates)
legend.position = c(0.85, 0.65),
)+
theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
theme(axis.line = element_line(size = 0.2, colour = "black"))
# 1.Load data
# metadata
design <- read.table(file = "data/group_RF.txt", sep = "\t", header = T, row.names=1)
# 60 samples
df_species <- read.table(file = "data/species_data_RF.txt", sep = "\t", header = T, check.names = FALSE)
# Sum duplicated rows so each species appears exactly once
data_species <- aggregate(. ~ Species, data = df_species, sum)
rownames(data_species) <- data_species$Species
data_species <- data_species[, -1]
data_species_ra <- apply(data_species, 2, function(x) x/100)  # kept for parity with the original; not used below
# Prevalence screen: keep species detected (non-zero) in >= 5% of samples.
# rowSums() replaces the original element-by-element double loop, and base
# subsetting preserves the species row names (the original used
# dplyr::filter(), which silently drops data.frame row names and would break
# the name-based marker lookup further down). n_samples replaces the
# hard-coded 60 and the hard-coded helper-column indices 61:63.
n_samples <- ncol(data_species)  # 60 samples in this dataset
zero_counts <- rowSums(data_species == 0)
sample_percent <- round(1 - zero_counts / n_samples, 6)
data_species3 <- data_species[sample_percent >= 0.05, ]
# Among the retained species, convert each sample to proportions, then keep
# species whose relative abundance reaches 0.01% (1e-4) in >= 5% of samples.
data_species3 <- apply(data_species3, 2, function(x) x/sum(x))
data_species3 <- as.data.frame(data_species3)
t_percent <- round(rowSums(data_species3 >= 0.0001) / n_samples, 6)
data_species4 <- data_species3[t_percent >= 0.05, ]
# log10 transform with a small pseudo-count so zeros stay finite
data_species5 <- log10(data_species4 + 1e-05)
## 2.Data split
# z-score normalization per species (row): (x - mean) / sd. apply() over rows
# returns samples-as-rows, so the result is transposed back.
data_species6 = apply(data_species5, 1, function(x){
  return((x-mean(x))/sd(x))
})
data_species6 = t(data_species6)
#write.csv(data6, "results/rf_model_species_used.csv")
# Select the previously deduplicated data for analysis
otutab = data_species6
design2 = design
# Select by manual set group
if (TRUE){
  sub_design = subset(design2, Group %in% c("Patients","Control"))
  sub_design$group = factor(sub_design$Group, levels=c("Patients","Control"))
}
# Keep only design rows that actually have a column in the abundance table
idx = rownames(sub_design) %in% colnames(otutab)
sub_design = sub_design[idx,]
sub_otutab = otutab[,rownames(sub_design)]
# Create data partition: 70/30 train/test split -- 42 of the 60 samples go
# to the training set, the remaining 18 to the test set.
otutab_t_species = as.data.frame(t(sub_otutab))
# Set classification info.
otutab_t_species$group = factor(sub_design$Group, levels = c("Patients","Control"))
otutab_t_species = na.omit(otutab_t_species)
row.name = rownames(otutab_t_species)
# 60 samples
# BUG FIX: the original line was `set.seed = 515`, which only creates a
# variable named `set.seed` and never seeds the RNG, so the train/test split
# was not reproducible. Call the function instead.
set.seed(515)
sam.row.name = sample(row.name, 42, replace = FALSE)
train_data_species = otutab_t_species[sam.row.name, ]
unique_rows_df1 <- setdiff(rownames(otutab_t_species), rownames(train_data_species))
test_data_species <- otutab_t_species[unique_rows_df1, ]
#test_data_species = setdiff(otutab_t_species, train_data_species)
## 3.Model training
# Build the training design matrix X_species (611 species features plus the
# outcome) and the 0/1 outcome factor (Control -> 0, Patients -> 1).
dat1_species <- train_data_species
conf_species <- as.data.frame(dat1_species$group)
rownames(conf_species) <- rownames(dat1_species)
colnames(conf_species) <- "Group"
conf_species$sample <- rownames(conf_species)
conf_species <- as.data.frame(conf_species)
dat2_species <- dat1_species
conf2_species <- conf_species
conf2_species$Group = as.factor(as.character(conf2_species$Group))
outcome_species = conf2_species$Group
outcome_species <- sub("Control","0",outcome_species)
outcome_species <- sub("Patients","1",outcome_species)
outcome_species <- as.factor(outcome_species)
dat_species <- dat2_species
X_species <- as.data.frame(dat_species)
X_species$outcome_species = outcome_species
# Drop the original `group` column by name instead of the fragile hard-coded
# position 612 (the original `X_species[, -612]`), leaving the 611 features
# with outcome_species as the last column -- same layout as before.
X_species <- X_species[, colnames(X_species) != "group"]
## 5*10_crossvalidation: 5 replicates of 10-fold cross-validated feature
## elimination (rfcv1 comes from function/randomforest.crossvalidation.R).
set.seed(999)
result_species <- replicate(5, rfcv1(X_species[,-ncol(X_species)],
X_species$outcome_species,
cv.fold=10,step=0.9), simplify=FALSE)
# One CV error curve per replicate; rows index the number of variables kept
error.cv <- sapply(result_species, "[[", "error.cv")
matplot(result_species[[1]]$n.var, cbind(rowMeans(error.cv), error.cv), type="l",
lwd=c(2, rep(1, ncol(error.cv))), col=1, lty=1, log="x",
xlab="Number of variables", ylab="CV Error")
# Candidate model sizes whose mean error is within one SD of the minimum
error.cv.cbm <- cbind(rowMeans(error.cv), error.cv)
cutoff <- min (error.cv.cbm[,1])+sd(error.cv.cbm[,1])
error.cv.cbm[error.cv.cbm[,1] < cutoff,]
# The table below is console output pasted from an interactive session; it is
# commented out so the script can be source()d without a parse error.
# [,1] [,2] [,3] [,4] [,5] [,6]
# 611 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 550 0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
# 495 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
# 445 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 401 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
# 361 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
# 325 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 292 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 263 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 237 0.02857143 0.04761905 0.02380952 0.02380952 0.02380952 0.02380952
# 213 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
# 192 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 173 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 140 0.02857143 0.02380952 0.02380952 0.02380952 0.02380952 0.04761905
# 126 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 113 0.03333333 0.02380952 0.04761905 0.02380952 0.02380952 0.04761905
# 102 0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
# 92 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 83 0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
# 74 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 67 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
# 60 0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
# 54 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
# 39 0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
# 36 0.02857143 0.02380952 0.04761905 0.02380952 0.02380952 0.02380952
# 26 0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
# 23 0.03333333 0.02380952 0.04761905 0.02380952 0.04761905 0.02380952
# 12 0.03333333 0.02380952 0.04761905 0.02380952 0.02380952 0.04761905
# 11 0.03333333 0.02380952 0.04761905 0.02380952 0.02380952 0.04761905
# 10 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 9 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# 8 0.03333333 0.07142857 0.02380952 0.02380952 0.02380952 0.02380952
# 4 0.03333333 0.02380952 0.07142857 0.02380952 0.02380952 0.02380952
# 3 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952 0.02380952
# Mark the chosen model size on the matplot above
abline(v=6,col="pink",lwd=2)
optimal = 6
# Reshape the CV error matrix for plotting. error.cv.cbm was built as
# cbind(rowMeans(error.cv), error.cv), so column 1 is the mean curve and
# columns 2:6 are the five replicate curves.
error.cv.cbm2 <- as.data.frame(error.cv.cbm)
error.cv.cbm2$num <- rownames(error.cv.cbm2)
n.var = as.numeric(error.cv.cbm2$num)
# BUG FIX: the original took columns 1:5 (the mean plus replicates 1-4) and
# labelled them err.1..err.5, dropping replicate 5 and double-counting the
# mean in err.mean. The five replicates are columns 2:6.
error.cv = error.cv.cbm2[, 2:6]
colnames(error.cv) = paste('err', 1:5, sep='.')
err.mean = apply(error.cv, 1, mean)
allerr = data.frame(num = n.var, err.mean = err.mean, error.cv)
allerr = as.data.frame(allerr)
write.table(allerr, file = "results/Biomarker_identification/Machine_learning/Species_rfcv_5_10_new.txt",
sep = "\t", quote = F, row.names = T, col.names = T)
conflicts_prefer(ggplot2::aes)
conflicts_prefer(ggplot2::annotate)
conflicts_prefer(ggplot2::ggsave)
# Read the CV error table back (allows re-plotting without rerunning rfcv)
allerr <- read.table(file = "results/Biomarker_identification/Machine_learning/Species_rfcv_5_10_new.txt",
sep = "\t", header = T, row.names=1)
# Theme for the cross-validation error curve
mytheme3 = theme_bw() + theme(text = element_text(family = "sans", size = 7))+
theme(legend.position="none",
legend.text = element_text(size=14),
legend.title = element_blank(),
panel.background = element_blank(),
panel.grid = element_blank(),
axis.text.y = element_text(size=14, colour="black", family = "sans", angle = 0),
axis.text.x = element_text(size=14, colour="black", family = "sans", angle = 0, hjust = 0),
axis.title= element_text(size=14),
strip.text.x = element_text(size=14, angle = 0),
strip.text.y = element_text(size=14, angle = 0),
plot.title = element_text(size=14, angle = 0),
strip.background.x = element_rect(fill = "#E5E4E2", colour = "black", size = 0.2))+
theme(axis.text.x=element_text(angle=0,vjust=1, hjust=0.6))+
theme(axis.line = element_line(size = 0.1, colour = "black"))
# CV error vs number of retained species: one grey line per replicate plus
# the black mean line; dashed lines mark the chosen size (`optimal`) and its
# error level. Columns are referenced by bare name inside aes() -- the
# original used `allerr$...`, which bypasses ggplot's data masking and is
# fragile under subsetting/faceting.
p01_species = ggplot(allerr, aes(x = num)) +
geom_line(aes(y = err.1), colour = 'grey') +
geom_line(aes(y = err.2), colour = 'grey') +
geom_line(aes(y = err.3), colour = 'grey') +
geom_line(aes(y = err.4), colour = 'grey') +
geom_line(aes(y = err.5), colour = 'grey') +
geom_line(aes(y = err.mean), colour = 'black') +
geom_vline(xintercept = optimal, colour='black', lwd=0.36, linetype="dashed") +
geom_hline(yintercept = 0.05976941, colour='black', lwd=0.36, linetype="dashed") +
mytheme3+
coord_trans(x = "log2") +
scale_x_continuous(breaks = c(10, 30, 50, 100, 200, 400)) + # , max(allerr$num)
labs(#title=paste('Training set (n = ', dim(train_data_species)[1],')', sep = ''),
x='Number of species ', y='Cross-validation error rate') +
annotate("text", x = optimal, y = max(allerr$err.mean), label=paste("optimal = ", optimal, sep="")) +
#main_theme+
theme_bw() + theme(panel.background = element_blank(),
panel.grid.major =element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none",
axis.title= element_text(size=10, family = "sans"))
ggsave("results/Biomarker_identification/Machine_learning/Species_rfcv_5_10_top6.pdf",
p01_species,width = 5,height = 3.2)
p01_species
##### Pick the top markers by cross-validation frequency #####
# Collect the per-fold variable rankings: 5 replicates x 10 folds = 50 rows,
# each row one ranking of all features. Sizes are derived from the data
# instead of the original hard-coded 611 columns / 50 rows.
n_feat <- ncol(X_species) - 1  # last column of X_species is the outcome
b <- matrix(0, ncol = n_feat, nrow = 5 * 10)
k <- 1
for (i in 1:5) {
  for (j in 1:10) {
    b[k, ] <- result_species[[i]]$res[[j]]
    k <- k + 1
  }
}
# Count how often each feature index appears in the top 6 of a fold ranking.
# as.vector() flattens column-major, the same order as the original double
# loop (and order is irrelevant to table()); this also avoids growing a
# vector named `list`, which shadowed base::list.
mlg.list <- b[, 1:6]
picked_all <- as.vector(mlg.list)
mlg.sort <- as.matrix(table(picked_all))
mlg.sort <- mlg.sort[rev(order(mlg.sort[, 1])), ]
# The 6 most frequently selected feature indices, mapped back to species names
pick_species <- as.numeric(names(head(mlg.sort, 6)))
tmp = X_species[,-ncol( X_species)]
mlg.pick.species <- colnames(tmp)[pick_species]
write.table(mlg.pick.species,"results/Biomarker_identification/Machine_learning/cross_validation_pick_6_in_species.txt",
sep="\t",quote=F)
## train.set
## Comparison of the probability of disease prediction between the disease group and the healthy control group
# Fit the final random forest on the 6 cross-validation-picked markers;
# column 612 is the outcome column (last column after the earlier group-column
# removal).
train1_species <- X_species[,c(pick_species,612)]
train1_species <-data.frame(train1_species)
set.seed(32)
train1.rf_species <- randomForest(outcome_species~., data =train1_species,
importance = TRUE)
# Out-of-bag probability of class "1" (Patients) for each training sample
train1.pre_species <- predict(train1.rf_species,type="prob")
p.train_species <- train1.pre_species[,2]
#boxplot(p.train~outcome,col=c(3,4),main="Probability of Patients")
write.table(p.train_species,"results/Biomarker_identification/Machine_learning/species.cross_validation.6makr.predict.in.train.txt",
sep="\t",quote=F)
# Recode the 0/1 outcome back to group labels for plotting
train1_pre2_species <- data.frame(outcome_species, p.train_species)
train1_pre2_species$outcome_species <- as.factor(train1_pre2_species$outcome_species)
train1_pre2_species$outcome_species <- sub("0","Healthy",train1_pre2_species$outcome_species)
train1_pre2_species$outcome_species <- sub("1","Patients",train1_pre2_species$outcome_species)
compaired = list(c("Healthy", "Patients"))
library(ggsignif)
library(scales)
compaired2 = list(c("Healthy", "Patients"))
# NOTE(review): the data frame built just above is immediately replaced by
# this file read; presumably "...train_box.txt" is a manually curated version
# of the table written above -- confirm the file exists before sourcing.
train1_pre2_species <- read.table(file = "results/Biomarker_identification/Machine_learning/species.cross_validation.6makr.predict.in.train_box.txt",
sep = "\t", header = T, row.names=1)
# Boxplot of predicted Patients-probability per group in the TRAINING set,
# with jittered points, custom whiskers, and a Wilcoxon significance bracket.
# NOTE(review): the `..ymax..`/`..ymin..` notation is the pre-ggplot2-3.4
# stat syntax (superseded by after_stat()); it still works but warns on
# newer ggplot2.
p02_species <- ggplot(train1_pre2_species, aes(x=outcome_species, y=p.train_species, fill=outcome_species)) +
geom_boxplot(position=position_dodge(width =0.4),width=0.5, size = 0.4,
fill = "transparent",
outlier.shape = NA,
linetype = "dashed")+
#theme_bw()+
theme_classic()+
labs(x = NULL, y = "Probability of Patients", color = outcome_species)+
geom_jitter(aes(color=outcome_species),position = position_jitter(0.15),
size = 0.3, alpha = 1)+
stat_boxplot(geom = "errorbar",aes(ymin=..ymax..),
width=0.18,color="black",size = 0.4)+
stat_boxplot(geom = "errorbar",aes(ymax=..ymin..),
width=0.18,color="black",size = 0.4)+
stat_boxplot(aes(ymin=..lower..,ymax=..upper.., fill=outcome_species), color="black",
fill = "transparent",position=position_dodge(width =0.4),
width=0.5, size = 0.4,outlier.shape = NA)+
geom_signif(comparisons = compaired2, step_increase = 0.3, map_signif_level = F,
test = wilcox.test, color = "black", size = 0.2, textsize = 3)+
scale_y_continuous(labels = label_number(accuracy = 0.1)) +
scale_fill_manual(values = c("#46a9cb","#945893"))+
scale_color_manual(values = c("#46a9cb","#945893"))+
theme(panel.background = element_blank(), panel.grid.major =element_blank(),
panel.grid.minor = element_blank(), legend.position = "none",
axis.text = element_text(size=10, family = "sans"),
axis.title= element_text(size=10, family = "sans"),
text = element_text(family = "sans", size = 10))
ggsave(paste("results/Biomarker_identification/Machine_learning/Species_6markers_patients_healthy_boxplot2",".pdf", sep=""),
p02_species, width=69 * 1.5, height=80 * 1.5, unit='mm')
p02_species
# Mean Decrease Accuracy: permutation importance -- how much OOB accuracy
# drops when a feature's values are shuffled, averaged over the forest.
varImpPlot(train1.rf_species, main = "Top feature importance", n.var = 6)
# Export the OOB confusion matrix and the per-feature importance table
# (sorted ascending so the most important markers end up at the bottom,
# which tail() picks up in the bar-plot sections below).
write.table(train1.rf_species$confusion, file = "results/Biomarker_identification/Machine_learning/Species_confusion_rf2.txt",
sep = "\t", quote = F, row.names = T, col.names = T)
imp_species = as.data.frame(round(importance(train1.rf_species), 2))
imp_species = imp_species[order(imp_species$MeanDecreaseAccuracy, decreasing = F),]
write.table(imp_species, file = "results/Biomarker_identification/Machine_learning/Species_imp_rf2.txt",
sep = "\t", quote = F, row.names = T, col.names = T)
# Phylum: shell awk join -- attach the phylum (taxonomy column 3) to each row
# of the importance table, matching on species name (taxonomy column 8); sed
# appends the "Phylum" header.
system("awk 'NR==FNR{a[$8]=$3} NR>FNR{print $0\"\t\"a[$1]}' data/taxonomy_RF.txt results/Biomarker_identification/Machine_learning/Species_imp_rf2.txt | sed '1 s/$/Phylum/' > results/Biomarker_identification/Machine_learning/Species_imp_phylum_rf2.txt")
# [1] 2   <- system() exit status pasted from an interactive session;
#            commented out so the script parses (status 2 suggests the
#            command failed -- verify the paths/awk output).
# Family: as above but with family (taxonomy column 6).
# NOTE(review): the sed here still appends the header "Phylum", and the
# output file has no "2" suffix, while the family bar-plot section below
# reads "Species_imp_family_rf2.txt" and expects a "Family" column --
# confirm which file name/header is intended.
system("awk 'NR==FNR{a[$8]=$6} NR>FNR{print $0\"\t\"a[$1]}' data/taxonomy_RF.txt results/Biomarker_identification/Machine_learning/Species_imp_rf2.txt | sed '1 s/$/Phylum/' > results/Biomarker_identification/Machine_learning/Species_imp_family_rf.txt")
# [1] 2
# Bar plot (Phylum): Mean Decrease Accuracy of the top `optimal` (6) markers,
# coloured by phylum. tail() takes the last rows because the importance table
# was sorted ascending before export.
imp_species = read.table("results/Biomarker_identification/Machine_learning/Species_imp_phylum_rf2.txt",
header=T, row.names= 1, sep="\t")
imp_species = tail(imp_species, n = optimal)
# Fix the bar order to the (ascending-importance) row order
imp_species$Species = factor(rownames(imp_species), levels = rownames(imp_species))
p03_species = ggplot(imp_species, aes(x = Species, y = MeanDecreaseAccuracy, fill = Phylum)) +
geom_bar(stat = "identity") + theme_classic()+
#scale_fill_manual(values = c("#63B8FF","orange","#4AB3AA", "#D10640"))+
# scale_color_manual(values = c("#63B8FF", "orange","#4AB3AA","#D10640"))+
scale_fill_manual(values = c("#63B8FF","#4AB3AA", "#D10640"))+
scale_color_manual(values = c("#63B8FF", "#4AB3AA","#D10640"))+
coord_flip() + #main_theme+
theme(legend.position = c(0.85,0.8))+
scale_y_continuous(expand = c(0,0))+
labs(y = "Mean Decrease Accuracy", x = "Species")
ggsave(paste("results/Biomarker_identification/Machine_learning/Species_top_feautre_top6markers_phylum",
".pdf", sep=""), p03_species, width=119 * 1.5, height=80 * 1.5, unit='mm')
p03_species
# Bar plot (Family): same as the phylum plot but coloured by family.
# NOTE(review): this reads "Species_imp_family_rf2.txt", but the awk step
# above writes "Species_imp_family_rf.txt" (no "2") with a header column
# named "Phylum" -- confirm the rf2 file was produced/renamed manually,
# otherwise this read (and `fill = Family`) will fail.
imp_species = read.table("results/Biomarker_identification/Machine_learning/Species_imp_family_rf2.txt",
header=T, row.names= 1, sep="\t")
imp_species = tail(imp_species, n = optimal)
imp_species$Species = factor(rownames(imp_species), levels = rownames(imp_species))
p04_species = ggplot(imp_species, aes(x = Species, y = MeanDecreaseAccuracy, fill = Family)) +
geom_bar(stat = "identity") + theme_classic()+
coord_flip() + #main_theme+
scale_fill_manual(values = c("#d2da93","#5196d5","#00ceff","#ff630d","#9b82e1",
"#e5acd7","#36999d","#ec8181","#dfc6a5","#e50719",
"#d27e43","#8a4984","#fe5094","#8d342e","#f94e54",
"#ffad00","#36999d","#00fc8d","#b64aa0","#9b82e1"))+
scale_y_continuous(expand = c(0,0))+
labs(y = "Mean Decrease Accuracy", x = "Species")
ggsave(paste("results/Biomarker_identification/Machine_learning/Species_top_feautre_top6markers_family",".pdf", sep=""),
p04_species, width=119 * 1.5, height=70 * 1.5, unit='mm')
p04_species
## 4.Train set ROC curve
# ROC in train set, built from the OOB predicted probabilities
roc1_species_train <- roc(outcome_species, p.train_species,
ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,
plot=TRUE#, percent=roc1$percent,col=2
)
# Get AUROC mean and confidence interval (ci[1] = lower, ci[3] = upper)
auc_species_train = round(roc1_species_train$auc,3)
roc1_species_train2 <- plot.roc(outcome_species, p.train_species, ci=TRUE, print.auc=TRUE)
ci_low_species_train = round(roc1_species_train2$ci[1], 3)
ci_high_species_train = round(roc1_species_train2$ci[3], 3)
# Calculate 95% confidence interval of sensitivity over a specificity grid
roc.list <- list(roc1_species_train)
ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25))
ciobj02 <- ci.se(roc1_species_train, # CI of sensitivity, random forest
specificities=seq(0, 1, 0.01)) # over a select set of specificities
# Put multiple ROC confidence interval upper and lower thresholds into a data frame
# NOTE(review): ciobj3 is never used -- the lapply below names its function
# argument ciobj3, shadowing this variable; the ribbon data actually comes
# from ci.list (the 25-point grid), not from ciobj02.
ciobj3 <- as.data.frame(ciobj02)
dat.ci.list <- lapply(ci.list, function(ciobj3)
data.frame(x = as.numeric(rownames(ciobj3)),
lower = ciobj3[, 1],
upper = ciobj3[, 3]))
# Plot: ROC curve with diagonal reference line and AUC/CI annotations
p1_species_train <- ggroc(roc.list, legacy.axes = TRUE) + #theme_minimal() +
theme_bw()+ coord_equal()+
theme(panel.background = element_blank(),
panel.grid.major =element_blank(),
panel.grid.minor = element_blank(),
#axis.title= element_text(size=10, family = "sans"),
#plot.title = element_text(size = 10, family = "sans", hjust = 0.5),
#text = element_text(family = "sans", size = 10),
legend.position = "none")+
geom_abline(slope=1, intercept = 0, linetype = "dashed", alpha=0.5, color = "grey") +
coord_fixed(ratio = 0.9)+ ggtitle("Training set (n = 42)")+
geom_line(size = 0.8)+labs(x = "1 - Specificity", y = "Sensitivity")+
annotate("text", x = 0.77, y = 0.18, label = paste0("AUC = ", auc_species_train), size = 3)+
annotate("text", x = 0.77, y = 0.08, label = paste0("CI = ", ci_low_species_train, "-", ci_high_species_train), size = 3)+
scale_color_manual(values=c("#CD3278"))
col.list = list("#CD3278")
# Add confidence interval ribbon (x is a specificity, hence 1-x on the plot)
for(i in 1:1) {
p1_species_train <- p1_species_train + geom_ribbon(
data = dat.ci.list[[i]],
aes(x = 1-x, ymin = lower, ymax = upper),
fill = col.list[[i]],
alpha = 0.3,
inherit.aes = F)
}
#ggsave(paste("results/Biomarker_identification/Machine_learning/selected_6_species_model_auroc_train_set",".pdf", sep=""), p1_species_train, width=109 * 1.5, height=60 * 1.5, unit='mm')
p1_species_train
## 5.Test set ROC curve
# ROC in test set: predict with the trained 6-marker forest on the held-out
# samples (the full feature table works because randomForest::predict picks
# the columns it needs by name).
#dat3_species_test <- test_data_species[,c(pick_species, 402)]
dat3_species_test <- test_data_species
dat3_species_test <- data.frame(dat3_species_test)
set.seed(32)
test_species <- predict(train1.rf_species, dat3_species_test, type="prob")
conf3_species_test <- as.data.frame(dat3_species_test$group)
rownames(conf3_species_test) <- rownames(dat3_species_test)
colnames(conf3_species_test) <- "Group"
conf3_species_test$sample <- rownames(conf3_species_test)
conflicts_prefer(base::intersect)
# Align prediction rows with the group table. data.frame() prepends "X" to
# names starting with a digit, so strip it before matching.
rN.test <- rownames(test_species)
rN.test <- sub("X","",rN.test)
rN.conf <- rownames(conf3_species_test)
gid <- intersect(rN.test ,rN.conf)
test_species <- test_species[pmatch(gid, rN.test), ]
conf3_species_test <- conf3_species_test[pmatch(gid, rN.conf), ]
# Column 2 of the probability matrix is P(class "1" = Patients)
write.table(test_species[, 2],"results/Biomarker_identification/Machine_learning/species.cross_validation.6makr.predict.in.test.txt",
sep="\t",quote=F)
compaired2 = list(c("Healthy", "Patients"))
# NOTE(review): as for the training set, this reads a "_box.txt" file that
# this script only wrote without the "_box" suffix -- presumably a manually
# curated copy; confirm it exists and contains the columns outcome_species
# and p.test_species.
test1_pre2_species <- read.table(file = "results/Biomarker_identification/Machine_learning/species.cross_validation.6makr.predict.in.test_box.txt",
sep = "\t", header = T, row.names=1)
# Boxplot of predicted Patients-probability per group in the TEST set
# (same construction as the training-set boxplot above).
p02_species_test <- ggplot(test1_pre2_species, aes(x=outcome_species, y=p.test_species, fill=outcome_species)) +
geom_boxplot(position=position_dodge(width =0.4),width=0.5, size = 0.4,
fill = "transparent",
outlier.shape = NA,
linetype = "dashed")+
#theme_bw()+
theme_classic()+
labs(x = NULL, y = "Probability of Patients", color = outcome_species)+
geom_jitter(aes(color=outcome_species),position = position_jitter(0.15),
size = 0.3, alpha = 1)+
stat_boxplot(geom = "errorbar",aes(ymin=..ymax..),
width=0.18,color="black",size = 0.4)+
stat_boxplot(geom = "errorbar",aes(ymax=..ymin..),
width=0.18,color="black",size = 0.4)+
stat_boxplot(aes(ymin=..lower..,ymax=..upper.., fill=outcome_species), color="black",
fill = "transparent",position=position_dodge(width =0.4),
width=0.5, size = 0.4,outlier.shape = NA)+
geom_signif(comparisons = compaired2, step_increase = 0.3, map_signif_level = F,
test = wilcox.test, color = "black", size = 0.2, textsize = 3)+
scale_y_continuous(labels = label_number(accuracy = 0.1)) +
scale_fill_manual(values = c("#46a9cb","#945893"))+
scale_color_manual(values = c("#46a9cb","#945893"))+
theme(panel.background = element_blank(), panel.grid.major =element_blank(),
panel.grid.minor = element_blank(), legend.position = "none",
axis.text = element_text(size=10, family = "sans"),
axis.title= element_text(size=10, family = "sans"),
text = element_text(family = "sans", size = 10))
ggsave(paste("results/Biomarker_identification/Machine_learning/Species_6markers_patients_healthy_boxplot_test2",
".pdf", sep=""), p02_species_test, width=69 * 1.5, height=80 * 1.5, unit='mm')
p02_species_test
########test.ROC##########
# Recode the test-set groups to 0/1 to match the training encoding
outcome_species_test = conf3_species_test$Group
outcome_species_test <- sub("Control","0",outcome_species_test)
outcome_species_test <- sub("Patients", "1", outcome_species_test)
roc1_species_test <- roc(outcome_species_test, test_species[,2],
ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,
plot=TRUE#, percent=roc1$percent,col=2
)
# AUROC
roc_species_test2 = roc(outcome_species_test, test_species[,2])
# roc_species_test2
# Get AUROC mean and confidence interval (ci[1] = lower, ci[3] = upper)
auc_species_test = round(roc_species_test2$auc,3)
roc_species_test2_2 <- plot.roc(outcome_species_test, test_species[, 2],
ci=TRUE, print.auc=TRUE)
ci_low_species_test = round(roc_species_test2_2$ci[1], 3)
# NOTE(review): "sepcies" is a typo but the name is used consistently below
ci_high_sepcies_test = round(roc_species_test2_2$ci[3], 3)
# Calculate 95% confidence interval of sensitivity over a specificity grid
roc.list <- list(roc_species_test2)
ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25))
ciobj02 <- ci.se(roc_species_test2, # CI of sensitivity, random forest
specificities=seq(0, 1, 0.01)) # over a select set of specificities
# Put multiple ROC confidence interval upper and lower thresholds into a data frame
# NOTE(review): ciobj3 is unused -- the lapply argument shadows it; the
# ribbon data comes from ci.list (25-point grid).
ciobj3 <- as.data.frame(ciobj02)
dat.ci.list <- lapply(ci.list, function(ciobj3)
data.frame(x = as.numeric(rownames(ciobj3)),
lower = ciobj3[, 1],
upper = ciobj3[, 3]))
# Plot: test-set ROC with AUC/CI annotations
p1_species_test <- ggroc(roc.list, legacy.axes = TRUE) + #theme_minimal() +
theme_bw()+
theme(panel.background = element_blank(),
panel.grid.major =element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none") + coord_equal() + coord_fixed(ratio = 0.9)+
geom_abline(slope=1, intercept = 0, linetype = "dashed", alpha=0.5, color = "grey") +
ggtitle("Testing set (n = 18)")+
geom_line(size = 0.8)+labs(x = "1 - Specificity", y = "Sensitivity")+
annotate("text", x = 0.77, y = 0.18, label = paste0("AUC = ", auc_species_test), size = 3)+
annotate("text", x = 0.77, y = 0.08, label = paste0("CI = ", ci_low_species_test, "-", ci_high_sepcies_test), size = 3)+
scale_color_manual(values=c("#CD3278"))
col.list = list("#CD3278")
# Add confidence intervals (ribbon; x is a specificity, hence 1-x)
for(i in 1:1) {
p1_species_test <- p1_species_test + geom_ribbon(
data = dat.ci.list[[i]],
aes(x = 1-x, ymin = lower, ymax = upper),
#fill = i + 1,
fill = col.list[[i]],
alpha = 0.3,
inherit.aes = F)
}
#ggsave(paste("results/Biomarker_identification/Machine_learning/all_selected_6_species_model_auroc_test_set_new",".pdf", sep=""), p1_species_test, width=109 * 1.5, height=60 * 1.5, unit='mm')
p1_species_test
# Side-by-side train/test ROC panel (patchwork `+`)
p_species_all = p1_species_train + p1_species_test
ggsave(paste("results/Biomarker_identification/Machine_learning/all_selected_6_species_model_auroc_both_set_new",".pdf", sep=""),
p_species_all, width=109 * 1.5, height=60 * 1.5, unit='mm')
p_species_all
library(patchwork)
# Three-panel figure: CV error curve + train boxplot + test boxplot
p_abc <- p01_species + p02_species + p02_species_test
p_abc
ggsave(paste("results/Biomarker_identification/Machine_learning/p_abc1",".pdf",
sep=""), p_abc, width=169 * 1.5, height=60 * 1.5, unit='mm')
# NOTE(review): patchwork's `|` has lower precedence than `+`, so this is
# p01_species | (p02_species + p02_species_test) -- the CV curve beside a
# nested two-panel group; confirm that layout is the intended one.
p_abc2 <- p01_species | p02_species + p02_species_test
p_abc2
ggsave(paste("results/Biomarker_identification/Machine_learning/p_abc2",".pdf",
sep=""), p_abc2, width=140 * 1.5, height=60 * 1.5, unit='mm')
## 6.Train and test set ROC curve combination
# Train and test set ROC curves overlaid in one panel with CI ribbons
roc.list01 <- list(roc1_species_train, roc_species_test2)
ci.list01 <- lapply(roc.list01, ci.se, specificities = seq(0, 1, l = 25))
ciobj01 <- ci.se(roc1_species_train, # CI of sensitivity, random forest
specificities=seq(0, 1, 0.01)) # over a select set of specificities
ciobj02 <- ci.se(roc_species_test2,
specificities=seq(0, 1, 0.01))
# Put multiple ROC confidence interval upper and lower thresholds into a data frame
# NOTE(review): as.data.frame(ciobj01, ciobj02) passes ciobj02 as the
# `row.names` argument, and ciobj3 is shadowed by the lapply argument below
# anyway -- these two lines have no effect on the plot; the ribbons come
# from ci.list01.
ciobj3 <- as.data.frame(ciobj01, ciobj02)
dat.ci.list <- lapply(ci.list01, function(ciobj3)
data.frame(x = as.numeric(rownames(ciobj3)),
lower = ciobj3[, 1],
upper = ciobj3[, 3]))
# Plot
# NOTE(review): the legend AUC/CI values are hard-coded strings; they do not
# update if the model is refit (and "AUC = 0.933 (CI = 0.978-1.000)" is
# internally inconsistent -- verify against auc_species_train and its CI).
p_all_species <- ggroc(roc.list01, legacy.axes = TRUE) + #theme_minimal() +
theme_bw()+
theme(panel.background = element_blank(),
panel.grid.major =element_blank(),
panel.grid.minor = element_blank())+
geom_abline(slope=1, intercept = 0, linetype = "dashed", alpha=0.5, color = "grey") + coord_equal()+
theme(legend.position = c(0.70, 0.17))+coord_fixed(ratio = 0.9)+
#ggtitle("Train set")+
geom_line(size = 0.6)+labs(x = "1 - Specificity", y = "Sensitivity")+
scale_color_manual(values=c("#5ebcc2","#d274ae","#849c4c"),#"#2d8c46",
name= "",
labels = c("Training set (AUC = 0.933 (CI = 0.978-1.000))", "Testing set (AUC = 1.000 (CI = 1.000-1.000))")
)
col.list = list("#5ebcc2","#d274ae","#849c4c")#"#2d8c46",
# Add confidence intervals, one ribbon per ROC curve
for(i in 1:2) {
p_all_species <- p_all_species + geom_ribbon(
data = dat.ci.list[[i]],
aes(x = 1-x, ymin = lower, ymax = upper),
fill = col.list[[i]],
alpha = 0.2,
inherit.aes = F)
}
ggsave(paste("results/Biomarker_identification/Machine_learning/all_selected_6_species_model_auroc_both_set_new02",
".pdf", sep=""), p_all_species, width=75 * 1.5, height=60 * 1.5, unit='mm')
p_all_species
## RF models with single species
# Select the top important species (here 6, despite the "32" in the names,
# which are left over from an earlier version) and rebuild per-sample tables.
# Data preparation
top_32 <- imp_species[1:6, ]
top_32_s <- data_species6[rownames(data_species6) %in% rownames(top_32), ]
top_32_s <- as.data.frame(t(top_32_s))
# Training subset: rows shared with the earlier train split; the group label
# is recovered by stripping digits from the sample ID (assumes IDs are
# group-name + number, e.g. "Patients12" -- TODO confirm).
train_data_t32_2 <- top_32_s[rownames(top_32_s) %in% rownames(train_data_species), ]
train_data_t32_2 = as.data.frame(train_data_t32_2)
train_data_t32_2$group = rownames(train_data_t32_2)
train_data_t32_2$group = as.character(train_data_t32_2$group)
train_data_t32_2$group = gsub("[0-9]","", train_data_t32_2$group)
# Test subset, same construction
test_data_t32_2 <- top_32_s[rownames(top_32_s) %in% rownames(test_data_species), ]
test_data_t32_2 = as.data.frame(test_data_t32_2)
test_data_t32_2$group = rownames(test_data_t32_2)
test_data_t32_2$group = as.character(test_data_t32_2$group)
test_data_t32_2$group = gsub("[0-9]","", test_data_t32_2$group)
## Single species model predictions
# training set or testing set
library(ggpubr)
# Fit a single-feature random forest on the training data and return a
# ggplot ROC curve (with a bootstrapped 95% CI ribbon) built from the
# model's out-of-bag class probabilities. The plot title is the species name
# (first column of the input).
#
# species01_train: data.frame with one species column (column 1) and a
#                  `group` column with values "Patients"/"Healthy".
# species01_test:  unused. The original declared it as a required argument
#                  but every call site passes only one argument (which only
#                  worked via lazy evaluation); it now defaults to NULL for
#                  backward compatibility.
ROC_Single_Species_train <- function(species01_train, species01_test = NULL){
  species01_train$group = factor(species01_train$group, levels = c("Patients","Healthy"))
  set.seed(999)  # reproducible forest and CI bootstraps
  species01_train <- data.frame(species01_train)
  otutab_t.rf_species01 <- randomForest(group~., data = species01_train,
                                        importance = TRUE)
  # Out-of-bag probability of the second class level ("Healthy")
  train1.pre_species <- predict(otutab_t.rf_species01, type = "prob")
  p.train_species <- train1.pre_species[,2]
  roc1_species_train <- roc(species01_train$group, p.train_species,
                            ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,
                            plot=TRUE
  )
  # AUROC and its confidence interval (ci[1] = lower, ci[3] = upper)
  auc_species_train = round(roc1_species_train$auc,3)
  roc1_species_train2 <- plot.roc(species01_train$group, p.train_species, ci=TRUE, print.auc=TRUE)
  ci_low_species_train = round(roc1_species_train2$ci[1], 3)
  ci_high_species_train = round(roc1_species_train2$ci[3], 3)
  # 95% CI of sensitivity over 25 specificities, for the ribbon.
  # (The original also ran a second, unused ci.se() bootstrap over a
  # 101-point grid -- dead code that roughly doubled the runtime; removed.)
  roc.list_top30 <- list(roc1_species_train)
  ci.list <- lapply(roc.list_top30, ci.se, specificities = seq(0, 1, l = 25))
  dat.ci.list <- lapply(ci.list, function(ciobj)
    data.frame(x = as.numeric(rownames(ciobj)),
               lower = ciobj[, 1],
               upper = ciobj[, 3]))
  # ROC plot with diagonal reference and AUC/CI annotations
  p3_s01 <- ggroc(roc.list_top30, legacy.axes = TRUE) +
    theme_bw()+
    theme(panel.background = element_blank(),
          panel.grid.major =element_blank(),
          panel.grid.minor = element_blank(),
          axis.title= element_text(size=12, family = "sans"),
          axis.text.x = element_text(size = 12, family = "sans"),
          axis.text.y = element_text(size = 12, family = "sans"),
          text = element_text(family = "sans", size = 12),
          legend.position = "none") + coord_equal() + coord_fixed(ratio = 0.9)+
    geom_abline(slope=1, intercept = 0, linetype = "dashed", alpha=0.5, color = "grey") +
    geom_line(size = 0.3)+labs(x = "1 - Specificity", y = "Sensitivity")+
    ggtitle(colnames(species01_train)[1])+
    annotate("text", x = 0.75, y = 0.17, label = paste0("AUC = ", auc_species_train), size = 4)+
    annotate("text", x = 0.75, y = 0.1, label = paste0("CI = ", ci_low_species_train, "-", ci_high_species_train), size = 4)+
    scale_color_manual(values=c("#74add1"))
  # Add the CI ribbon (single ROC, so the original for(i in 1:1) loop is
  # unnecessary); x is a specificity, hence 1-x on the legacy axes.
  p3_s01 + geom_ribbon(
    data = dat.ci.list[[1]],
    aes(x = 1-x, ymin = lower, ymax = upper),
    fill = "#74add1",
    alpha = 0.3,
    inherit.aes = F)
}
# create a list for ROC plots -- one single-species ROC per top marker
# (columns 1..6 of the 7-column train table; column 7 is `group`).
plots <- list()
# capture.output(suppressWarnings(...)) silences the verbose pROC/randomForest
# chatter. NOTE(review): this also hides genuine warnings -- consider handling
# specific conditions instead of blanket suppression.
capture.output(suppressWarnings(for(i in 1: 6){
species01 <- train_data_t32_2[, c(i,7)]
p01 <- ROC_Single_Species_train(species01)
plots[[i]] <- p01
}))
# character(0)   <- console output pasted from an interactive session,
#                   commented out so the script can be source()d.
# combine plots into a 2x3 grid
conflicts_prefer(ggpubr::ggarrange)
final_plot <- ggarrange(plotlist = plots, nrow = 2, ncol = 3)
# save plots
pdf("results/Biomarker_identification/Machine_learning/ROC_single_species_train_data_top6_new4.pdf",
width = 12, height = 10)
# BUG FIX: a bare `final_plot` is only auto-printed at the interactive
# top level; under source() the PDF would be empty. Print explicitly.
print(final_plot)
dev.off()
# png
# 2
# ^ dev.off() console output pasted from an interactive session, commented out.
final_plot
# SparCC co-abundance network using R software
# 1. Data preparation
# SparCC correlations
# load packages
library(reshape2)
library(ggplot2)
library(ggprism)
library(dplyr)
library(plyr)
library(igraph)
# metadata
design <- read.table(file = "data/group.txt", sep = "\t", header = T, row.names=1)
# Species data
df3 <- read.table(file = "data/species_data.txt", sep = "\t", header = T, check.names = FALSE)
# sum of Species
data<-aggregate(.~ Species,data=df3,sum)
rownames(data) = data$Species
data = data[, -1]
# Retain microbiota species with prevalence > 5% (and relative abundance > 1e-4)
# NOTE(review): only the prevalence filter is implemented here; the abundance
# threshold mentioned in the comment is not applied — confirm this is intended.
# 1. prevalence > 5%
# Count zeros per species (row). Vectorized rowSums() replaces the original
# O(rows x cols) nested for-loop with identical results.
zero_counts <- rowSums(data == 0)
# output
zero_count = as.data.frame(zero_counts)
data2 = data
data2$zero_counts = zero_count$zero_counts
# Number of samples; was hard-coded to 20, generalized to the actual column count
data2$all_counts = ncol(data)
data2$sample_percent = round(1 - data2$zero_counts/data2$all_counts, 6)
# Keep species present in at least 5% of the samples
data3 = data2 %>% filter(data2$sample_percent >= 0.05)
# Drop the helper columns by name (robust to the number of samples; the
# original dropped columns 21:23 by position, valid only for 20 samples)
data3 = data3[, setdiff(colnames(data3), c("zero_counts", "all_counts", "sample_percent"))]
# check data
dim(data3)
[1] 695 20
# Scale relative abundances to pseudo-counts for SparCC/FastSpar input
data3 = data3 * 100000
#write.table(sub_otutab_npc, file = "results/Correlation_network_analysis/sparcc_p01.txt", row.names = T, sep = "\t", quote = T, col.names = T)
# Prepend an OTU_id column holding the species names
OTU.table.filtered.colnames <- colnames(data3)
OTU.table.filtered.sparcc <- cbind(rownames(data3), data3)
colnames(OTU.table.filtered.sparcc) <- c("OTU_id", OTU.table.filtered.colnames)
# NOTE(review): t() on a data.frame containing a character column coerces the
# entire matrix to character; the abundance values become strings from here on.
OTU.table.filtered.sparcc2 <- t(OTU.table.filtered.sparcc)
# Drop the transposed "OTU_id" row; rows are now samples, columns species
OTU.table.filtered.sparcc2 <- OTU.table.filtered.sparcc2[-1,]
OTU.table.filtered.sparcc2 <- as.data.frame(OTU.table.filtered.sparcc2)
# Derive a group label per sample: strip digits from the sample name,
# then relabel "Healthy" as "Control"
OTU.table.filtered.sparcc2$group <- rownames(OTU.table.filtered.sparcc2)
OTU.table.filtered.sparcc2$group = as.character(OTU.table.filtered.sparcc2$group)
#OTU.table.filtered.sparcc2$group = sub("[0-9]","_", OTU.table.filtered.sparcc2$group)
OTU.table.filtered.sparcc2$group = gsub("[0-9]","", OTU.table.filtered.sparcc2$group)
OTU.table.filtered.sparcc2$group = gsub("Healthy","Control", OTU.table.filtered.sparcc2$group)
# Transpose back: rows = species plus a trailing "group" row, columns = samples
otutab <- as.data.frame(t(OTU.table.filtered.sparcc2))
# Select by manual set group
# NPC group: keep only samples labelled "Patients"
if (TRUE){
sub_design = subset(design, Group %in% c("Patients"))
sub_design$Group = factor(sub_design$Group, levels=c("Patients"))
}
# Match design rows to the sample columns actually present in the OTU table
idx = rownames(sub_design) %in% colnames(otutab)
sub_design_npc = sub_design[idx,]
# Subset OTU table to the selected samples, in design order
sub_otutab_npc = otutab[,rownames(sub_design_npc)]
# NOTE(review): row 37 is removed by position — presumably a non-abundance row
# (possibly the "group" row added earlier); confirm the index remains valid if
# the filtered species set changes.
sub_otutab_npc = sub_otutab_npc[-37, ]
sub_otutab_npc <- as.data.frame(sub_otutab_npc)
# Export patients-only table for FastSpar
write.table(sub_otutab_npc, file = "results/Correlation_network_analysis/species_sparcc_p01.txt",
row.names = T, sep = "\t", quote = T, col.names = T)
# Healthy group: same procedure, keeping only "Control" samples
if (TRUE){
sub_design = subset(design, Group %in% c("Control"))
sub_design$Group = factor(sub_design$Group, levels=c("Control"))
}
# Match design rows to the sample columns present in the OTU table
idx = rownames(sub_design) %in% colnames(otutab)
sub_design_healthy = sub_design[idx,]
sub_otutab_healthy = otutab[,rownames(sub_design_healthy)]
# NOTE(review): positional row drop mirroring the patients branch — verify
# index 37 targets the intended (non-abundance) row here as well.
sub_otutab_healthy = sub_otutab_healthy[-37, ]
sub_otutab_healthy = as.data.frame(sub_otutab_healthy)
# Export controls-only table for FastSpar
write.table(sub_otutab_healthy, file = "results/Correlation_network_analysis/species_sparcc_h01.txt",
row.names = T, sep = "\t", quote = T, col.names = T)
# OTU.table.filtered.sparcc <- OTU.table.filtered.sparcc[, -1]
# We need to transpose the table
# Patients group
#write.table(sub_otutab_npc, file = "results/genus_sparcc_p01_11R_2.tsv", row.names = T, sep = "\t", quote = T, col.names = T)
#write.table(sub_otutab_healthy, file = "results/genus_sparcc_h01_11R_2.tsv", row.names = T, sep = "\t", quote = T, col.names = T)
## 2. SparCC correlation analysis
#!/usr/bin/bash
# install FastSpar
#conda install -c bioconda -c conda-forge fastspar
# Convert CSV file to TSV file
#awk -F ',' 'BEGIN {OFS="\t"} {$1=$1}1' tests/data/sparcc_npc_h01_11R.txt > tests/data/sparcc_npc_h01_11R.tsv
#awk -F ',' 'BEGIN {OFS="\t"} {$1=$1}1' tests/data/sparcc_npc_p01_11R.txt > tests/data/sparcc_npc_p01_11R.tsv
# Control group
# Correlation inference
# test run
#fastspar --otu_table tests/data/fake_data.tsv --correlation median_correlation.tsv --covariance median_covariance.tsv
#fastspar --otu_table tests/data/Species_SparCC_HC_table5.tsv --correlation median_correlation_HC.tsv --covariance median_covariance_HC.tsv
# change iterations
#fastspar --iterations 100 --exclude_iterations 20 --otu_table tests/data/fake_data.tsv --correlation median_correlation.tsv --covariance median_covariance.tsv
#fastspar --iterations 100 --exclude_iterations 20 --otu_table tests/data/Species_SparCC_HC_table5.tsv --correlation median_correlation_HC.tsv --covariance median_covariance_HC.tsv
# change thresholds
#fastspar --threshold 0.1 --otu_table tests/data/fake_data.tsv --correlation median_correlation.tsv --covariance median_covariance.tsv
#fastspar --iterations 100 --threads 10 --threshold 0.1 --otu_table tests/data/Species_SparCC_HC_table5.tsv --correlation median_correlation_HC.tsv --covariance median_covariance_HC.tsv
# combine
#fastspar --iterations 100 --threads 3 --threshold 0.1 --otu_table tests/data/Species_SparCC_HC_table5.tsv --correlation median_correlation_HC.tsv --covariance median_covariance_HC.tsv
# Calculation of exact *p*-values
# First we generate the 1000 bootstrap counts:
#conda install -c intel mkl
#mkdir bootstrap_counts_HC
#fastspar_bootstrap --otu_table tests/data/Species_SparCC_HC_table5.tsv --number 1000 --prefix bootstrap_counts_HC/HC_data
# And then infer correlations for each bootstrap count (running in parallel with all processes available):
#mkdir bootstrap_correlation
#parallel fastspar --otu_table {} --correlation bootstrap_correlation/cor_{/} --covariance bootstrap_correlation/cov_{/} -i 5 ::: bootstrap_counts/*
# From these correlations, the *p*-values are then calculated:
#fastspar_pvalues --otu_table tests/data/fake_data.tsv --correlation median_correlation.tsv --prefix bootstrap_correlation/cor_fake_data_ --permutations 1000 --outfile pvalues.tsv
# add threads to accelerate
#fastspar --otu_table tests/data/fake_data.txt --correlation median_correlation.tsv --covariance median_covariance.tsv --iterations 50 --threads 10
# Alternative solutions
# Using sparcc in iNAP online website for analysis, the website is doing really well
# https://inap.denglab.org.cn/
## 3.Visualization based on Gephi software
library(plyr)
library(magrittr)
library(tidyr)
library(dplyr)
library(igraph)
# npc: SparCC correlation (r) and p-value matrices produced by FastSpar
r.cor <- read.table("data/r.cor.txt", sep="\t", header=T, check.names=F,row.names = 1)
p.cor <- read.table("data/p.cor.txt", sep="\t", header=T, check.names=F,row.names = 1)
# Zero out correlations that are not significant at p <= 0.05
r.cor[p.cor>0.05] <- 0
# Build network connection attributes and node attributes
# Convert data to long format for merging and add connection properties
r.cor$from = rownames(r.cor)
p.cor$from = rownames(p.cor)
# pivot_longer() supersedes gather(); downstream steps join/filter by the
# (from, to) keys, so the long-format row order is irrelevant.
p_value <- p.cor %>%
  pivot_longer(-from, names_to = "to", values_to = "p") %>%
  data.frame()
# Benjamini-Hochberg FDR across all pairwise p-values
p_value$FDR <- p.adjust(p_value$p,"BH")
# Drop the raw p column; keep from / to / FDR
p_value <- p_value[, -3]
# cor.data<- r.cor %>%
# gather(key = "to", value = "r", -from) %>%
# data.frame() %>%
# left_join(p_value, by=c("from","to")) %>%
# #diff$p.value <- p.adjust(diff$p.value,"BH")
# #filter(FDR <= 1e-5, from != to) %>%
# filter(FDR <= 0.1, from != to) %>%
# filter(abs(r) >= 0.15, from != to) %>%
# #filter(p <= 0.001, from != to) %>%
# plyr::mutate(
# linecolor = ifelse(r > 0,"positive","negative"),
# linesize = abs(r)
# )
# Long-format correlations joined with their FDR values
cor.data <- r.cor %>%
  gather(key = "to", value = "r", -from) %>%
  data.frame() %>%
  left_join(p_value, by = c("from", "to")) %>%
  as.data.frame()
# Keep significant (FDR <= 0.1), non-trivial (|r| >= 0.15), non-self edges
keep_edge <- cor.data$FDR <= 0.1 &
  abs(cor.data$r) >= 0.15 &
  cor.data$from != cor.data$to
cor.data <- cor.data[keep_edge, ]
# Annotate edges: colour by correlation sign, width by |r|
cor.data <- plyr::mutate(
  cor.data,
  linecolor = ifelse(r > 0, "positive", "negative"),
  linesize = abs(r)
)
# Set node properties: unique node names gathered from both edge endpoints
# (group_by + empty summarise yields one row per distinct value).
# Fix: summarise() belongs to dplyr here — clusterProfiler is never loaded in
# this script, so clusterProfiler::summarise() would fail.
vertices <- c(as.character(cor.data$from),as.character(cor.data$to)) %>%
  as_tibble() %>%
  group_by(value) %>%
  dplyr::summarise()
colnames(vertices) <- "name"
# Build graph data structure and add network basic attributes, save data
# Building a graph data structure
graph <- graph_from_data_frame(cor.data, vertices = vertices, directed = FALSE)
# Edge weight = |r|; node label = node name
E(graph)$weight <- abs(E(graph)$r)
V(graph)$label <- V(graph)$name
# save data
write_graph(graph, "results/Correlation_network_analysis/patients01.graphml", format="graphml")
# Visualized in Gephi software or Cytoscape software
# The same procedure for healthy group
Visualization with Gephi (利用Gephi可视化)
# Load packages
library(igraph) # Network Analysis and Visualization
library(Hmisc) # Harrell Miscellaneous
library(psych) # Procedures for Psychological, Psychometric, and Personality Research
library(dplyr) # A Grammar of Data Manipulation
library(tidyr) # Tidy Messy Data
conflicts_prefer(ggpubr::mutate)
conflicts_prefer(dplyr::summarise)
conflicts_prefer(plyr::arrange)
# Species relative abundances (in percent) and KO functional abundances
mic <- read.table("data/Spearman_network_data.txt", sep="\t", header=T, check.names=F,row.names = 1)
# Convert percentages to proportions. Plain division keeps the data.frame
# intact; the original apply(mic, 2, ...) coerced it to a matrix first.
mic <- mic / 100
gene <- read.table("data/Spearman_KO_data.txt", sep="\t", header=T, check.names=F,row.names = 1)
group <- read.table("data/group2.txt", sep="\t", header=T, check.names=F)
# Orient both tables samples-as-rows, then merge species and KO by sample id
mic <- as.data.frame(t(mic))
mic$sample <- rownames(mic)
gene <- as.data.frame(t(gene))
gene$sample <- rownames(gene)
df <- merge(mic, gene, by = "sample")
rownames(df) <- df$sample
# Drop the sample column; ids are kept in the rownames
df <- df[-1]
head(df)
Species01 Species02 Species03 Species04 Species05 Species06
Patients01 0.0001992 0.0e+00 0 0.00e+00 0.0000000 0.0002268
Patients02 0.0000000 0.0e+00 0 0.00e+00 0.0000300 0.0037749
Patients03 0.0000000 4.7e-06 0 0.00e+00 0.0000000 0.0078025
Patients04 0.0000000 0.0e+00 0 0.00e+00 0.0000000 0.0013194
Patients05 0.0000000 0.0e+00 0 0.00e+00 0.0000000 0.0000000
Patients06 0.0005237 0.0e+00 0 4.71e-05 0.0002594 0.0027443
Species07 Species08 Species09 Species10 Species11 Species12
Patients01 0.0000000 0.0000000 0.0002184 0.0000000 0.0000000 0.0000449
Patients02 0.0000383 0.0047634 0.0099377 0.0000000 0.0000000 0.0000000
Patients03 0.0003397 0.0000000 0.0000000 0.0000000 0.0008635 0.0121608
Patients04 0.0008964 0.0004650 0.0290622 0.0000466 0.0000000 0.0000000
Patients05 0.0000000 0.0000000 0.0000000 0.0000000 0.0008143 0.0000000
Patients06 0.0000000 0.0008082 0.0009162 0.0009799 0.0006761 0.0001429
Species13 Species14 Species15 Species16 Species17 Species18
Patients01 0.0088900 0.0030027 0.0484625 0.0535034 0 0.0004194
Patients02 0.0000884 0.0000111 0.1454322 0.0032423 0 0.0000000
Patients03 0.0171689 0.0000000 0.0000363 0.0000000 0 0.0046459
Patients04 0.0006176 0.0035240 0.1092701 0.0085378 0 0.0015125
Patients05 0.0000576 0.0000407 0.0039117 0.0004556 0 0.0003702
Patients06 0.0001377 0.0000526 0.0011158 0.0000696 0 0.0016015
Species19 Species20 Species21 Species22 Species23 Species24
Patients01 0.000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Patients02 0.000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Patients03 0.000996 0.0069058 0.0078448 0.0008736 0.0005681 0.0002662
Patients04 0.000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
Patients05 0.000000 0.0009546 0.0012607 0.0000000 0.0000000 0.0000000
Patients06 0.000034 0.0027267 0.0014984 0.0002432 0.0000965 0.0001687
Species25 Species26 Species27 Species28 Species29 Species30
Patients01 0.0000000 0.0000000 0.0000000 0.0060147 0.00e+00 0.0000000
Patients02 0.0000000 0.0000000 0.0000000 0.0000217 0.00e+00 0.0000000
Patients03 0.0000000 0.0132018 0.0051837 0.0008164 7.87e-05 0.0000000
Patients04 0.0000000 0.0019413 0.0015781 0.0000000 0.00e+00 0.0000000
Patients05 0.0000000 0.0003286 0.0000000 0.0298093 0.00e+00 0.0003695
Patients06 0.0001864 0.0036324 0.0084354 0.0021779 9.20e-06 0.0000000
Species31 Species32 Species33 Species34 Species35 Species36
Patients01 0 0.00e+00 0.00e+00 0.0000000 0.0000000 0.0000000
Patients02 0 0.00e+00 0.00e+00 0.0000000 0.0000000 0.0000551
Patients03 0 0.00e+00 0.00e+00 0.0024491 0.0006197 0.0006049
Patients04 0 0.00e+00 0.00e+00 0.0000000 0.0000000 0.0028813
Patients05 0 0.00e+00 0.00e+00 0.0000000 0.0000000 0.0000000
Patients06 0 5.93e-05 1.51e-05 0.0026383 0.0000000 0.0001643
Species37 Species38 Species39 Species40 Species41 Species42
Patients01 0.0042952 0 0 0.0000000 0.00e+00 0.0000168
Patients02 0.0000000 0 0 0.0066072 0.00e+00 0.0001719
Patients03 0.0000594 0 0 0.0002051 2.90e-06 0.0022513
Patients04 0.0000000 0 0 0.0000000 0.00e+00 0.0001092
Patients05 0.0000000 0 0 0.0033590 0.00e+00 0.0000000
Patients06 0.0003306 0 0 0.0000000 3.33e-05 0.0000000
Species43 Species44 Species45 Species46 Species47 Species48
Patients01 0.0000000 0.0005825 0.0000000 0 0.0000000 0.00e+00
Patients02 0.0000000 0.0000000 0.0000000 0 0.0000000 0.00e+00
Patients03 0.0016647 0.0159168 0.0039999 0 0.0000000 6.45e-05
Patients04 0.0000000 0.0000000 0.0000000 0 0.0070733 0.00e+00
Patients05 0.0000000 0.0000000 0.0000000 0 0.0306763 0.00e+00
Patients06 0.0000000 0.0014860 0.0007880 0 0.0026179 0.00e+00
Species49 Species50 Species51 Species52 Species53 Species54
Patients01 0 0.0000000 0.0022000 0.0000000 0.0001471 0.0035067
Patients02 0 0.0000000 0.0000000 0.0000000 0.0000000 0.0244186
Patients03 0 0.0067599 0.0154999 0.0010194 0.0000000 0.0006736
Patients04 0 0.0007462 0.0000000 0.0000000 0.0002468 0.0049033
Patients05 0 0.0011148 0.0000000 0.0029021 0.0120790 0.0742965
Patients06 0 0.0000000 0.0021384 0.0001013 0.0000487 0.0009094
Species55 Species56 Species57 Species58 Species59 Species60
Patients01 0.0006575 0.0000137 0.0006329 0.0068236 0.0000000 0.0000000
Patients02 0.0066833 0.0006340 0.0007965 0.0031478 0.0001694 0.0411671
Patients03 0.0006155 0.0000000 0.0000000 0.0000000 0.0000000 0.0000824
Patients04 0.0002064 0.0014813 0.0001085 0.0008470 0.0000720 0.0000836
Patients05 0.0001462 0.0006683 0.0000000 0.0035369 0.0014096 0.0092520
Patients06 0.0004803 0.0000000 0.0000000 0.0005778 0.0000297 0.0104302
Species61 Species62 Species63 Species64 Species65 Species66
Patients01 0.0000000 0 0.0000000 0.0000000 0.0000473 0.0000000
Patients02 0.0000000 0 0.0000000 0.0000000 0.0000000 0.0000000
Patients03 0.0020968 0 0.0031690 0.0001918 0.0059577 0.0080153
Patients04 0.0000000 0 0.0000409 0.0000129 0.0000000 0.0006103
Patients05 0.0000000 0 0.0000000 0.0000000 0.0000000 0.0000000
Patients06 0.0007391 0 0.0000000 0.0000000 0.0000062 0.0000000
Species67 Species68 Species69 Species70 Species71 Species72
Patients01 0.0000000 0.0102598 0.0014105 0.0000000 0 0e+00
Patients02 0.0000000 0.1318139 0.0000000 0.0024226 0 0e+00
Patients03 0.0059957 0.0300415 0.0509729 0.0000000 0 5e-06
Patients04 0.0000000 0.0012709 0.0000000 0.0000000 0 0e+00
Patients05 0.0000000 0.0353636 0.0000000 0.0000000 0 0e+00
Patients06 0.0000690 0.0002325 0.0058942 0.0000000 0 0e+00
Species73 Species74 Species75 Species76 Species77 Species78
Patients01 0.0000000 0.0054484 0 0.0000000 0.0000000 0
Patients02 0.0000000 0.0000000 0 0.0000000 0.0000000 0
Patients03 0.0000126 0.0000000 0 0.0000000 0.0004047 0
Patients04 0.0000000 0.0000000 0 0.0000000 0.0000000 0
Patients05 0.0000000 0.0000000 0 0.0000000 0.0000000 0
Patients06 0.0001204 0.0000000 0 0.0001239 0.0000000 0
Species79 Species80 Species81 Species82 Species83 Species84
Patients01 0 0.0001895 0.00e+00 0.0000000 0.1107871 0.0e+00
Patients02 0 0.0020453 0.00e+00 0.0000569 0.0000000 0.0e+00
Patients03 0 0.0000372 1.96e-05 0.0003917 0.0019890 0.0e+00
Patients04 0 0.0001923 0.00e+00 0.0000000 0.0000000 0.0e+00
Patients05 0 0.0036031 0.00e+00 0.0001910 0.0000000 0.0e+00
Patients06 0 0.0000000 0.00e+00 0.0254755 0.0000000 4.6e-06
Species85 Species86 Species87 Species88 Species89 Species90
Patients01 0 0.0060479 0.0000000 5.30e-06 1.2e-05 0.0000000
Patients02 0 0.0000000 0.0000000 0.00e+00 0.0e+00 0.0000000
Patients03 0 0.0002256 0.0039633 1.04e-05 0.0e+00 0.0005639
Patients04 0 0.0000000 0.0000000 0.00e+00 0.0e+00 0.0001344
Patients05 0 0.0000000 0.0000000 0.00e+00 0.0e+00 0.0000169
Patients06 0 0.0002721 0.0000000 0.00e+00 0.0e+00 0.0000567
Species91 Species92 Species93 Species94 Species95 Species96
Patients01 0.0000000 0.0000000 0.0000000 0 0.0024523 0.0000000
Patients02 0.0084149 0.0000000 0.0287016 0 0.0055949 0.0000000
Patients03 0.0000000 0.0030147 0.0000140 0 0.0005346 0.0001082
Patients04 0.0010065 0.0000000 0.0000000 0 0.0000750 0.0000000
Patients05 0.0000000 0.0000000 0.0001670 0 0.0005845 0.0000000
Patients06 0.0000000 0.0000000 0.0000105 0 0.0000000 0.0000175
Species97 Species98 Species99 Species100 Species101 Species102
Patients01 0.0000000 0.0000000 5.51e-04 0.0009535 0 0.0000000
Patients02 0.0000000 0.0000000 0.00e+00 0.0000000 0 0.0000000
Patients03 0.0000000 0.0062379 8.74e-05 0.0102851 0 0.0005359
Patients04 0.0000000 0.0000000 0.00e+00 0.0000000 0 0.0000000
Patients05 0.0000000 0.0000000 0.00e+00 0.0000000 0 0.0000000
Patients06 0.0007471 0.0022021 0.00e+00 0.0001538 0 0.0000000
Species103 Species104 Species105 Species106 Species107 Species108
Patients01 0.0000000 0.0001452 0.0025476 0.0000223 0 0.0000000
Patients02 0.0000000 0.0001737 0.0000000 0.0000000 0 0.0000000
Patients03 0.0000348 0.0001332 0.0082329 0.0000000 0 0.0176138
Patients04 0.0000000 0.0000000 0.0317250 0.0238322 0 0.0000000
Patients05 0.0016034 0.0000000 0.0000000 0.0000546 0 0.0000000
Patients06 0.0000000 0.0000000 0.0000784 0.0000000 0 0.0000000
Species109 Species110 Species111 Species112 Species113 Species114
Patients01 0.0032992 0 0.0000000 0.0027984 0 0.0005566
Patients02 0.0003825 0 0.0000000 0.0221687 0 0.0003348
Patients03 0.0078424 0 0.0509262 0.0093032 0 0.0000767
Patients04 0.0000000 0 0.0007866 0.0006682 0 0.0000000
Patients05 0.0003675 0 0.0170159 0.0194056 0 0.0000000
Patients06 0.0002681 0 0.0012940 0.0041489 0 0.0000000
Species115 Species116 Species117 Species118 Species119 Species120
Patients01 0.0002689 0.0000000 0.0018668 0.00e+00 0 0
Patients02 0.0000000 0.0002194 0.0000000 0.00e+00 0 0
Patients03 0.0000000 0.0000000 0.0008595 7.08e-05 0 0
Patients04 0.0000000 0.0000000 0.0000000 0.00e+00 0 0
Patients05 0.0000000 0.0000000 0.0000000 0.00e+00 0 0
Patients06 0.0000000 0.0000000 0.0000000 0.00e+00 0 0
K00001 K00002 K00003 K00004 K00005 K00006 K00007
Patients01 4.85e-07 4.06e-06 1.15e-06 1.95e-06 3.76e-06 0.00e+00 2.38e-06
Patients02 6.72e-06 8.98e-06 0.00e+00 3.08e-06 7.69e-07 2.80e-06 2.47e-05
Patients03 5.42e-07 0.00e+00 2.86e-06 5.00e-06 5.80e-06 0.00e+00 6.66e-06
Patients04 0.00e+00 0.00e+00 0.00e+00 0.00e+00 2.48e-07 0.00e+00 0.00e+00
Patients05 1.71e-06 2.92e-06 6.80e-07 9.30e-07 0.00e+00 9.66e-07 8.15e-06
Patients06 4.28e-06 8.78e-06 0.00e+00 2.83e-06 0.00e+00 2.56e-06 7.07e-06
K00008 K00009 K00010 K00011 K00012 K00013 K00014
Patients01 0.00e+00 0.00e+00 0.00e+00 0.00e+00 4.60e-06 3.43e-06 0.00e+00
Patients02 1.57e-06 1.42e-05 5.15e-06 1.15e-05 6.37e-06 1.58e-06 6.71e-06
Patients03 0.00e+00 1.26e-06 0.00e+00 2.10e-07 0.00e+00 1.02e-05 0.00e+00
Patients04 0.00e+00 0.00e+00 0.00e+00 5.67e-07 0.00e+00 4.36e-07 0.00e+00
Patients05 6.90e-07 3.82e-06 1.11e-06 1.90e-06 4.16e-07 1.13e-06 0.00e+00
Patients06 3.53e-06 1.53e-05 1.82e-06 9.00e-06 2.71e-06 2.62e-05 3.84e-06
K00015 K00016 K00017 K00018 K00019 K00020 K00021
Patients01 4.16e-06 0.00e+00 7.39e-06 2.18e-06 0.00e+00 5.63e-07 5.69e-06
Patients02 7.12e-06 6.36e-06 2.17e-07 2.82e-06 1.06e-05 7.09e-06 3.04e-07
Patients03 0.00e+00 0.00e+00 1.41e-06 0.00e+00 0.00e+00 3.12e-07 4.54e-06
Patients04 0.00e+00 0.00e+00 1.08e-05 0.00e+00 0.00e+00 5.37e-06 8.31e-07
Patients05 2.88e-06 1.64e-06 4.78e-07 1.17e-06 2.58e-06 5.91e-06 1.76e-06
Patients06 2.91e-06 3.07e-06 1.09e-06 8.25e-06 3.69e-06 1.35e-05 1.16e-05
K00022 K00023 K00024 K00025 K00026 K00027 K00028
Patients01 2.36e-07 0.00e+00 5.15e-07 0.00e+00 0.00e+00 0.00e+00 0.00e+00
Patients02 0.00e+00 1.38e-05 2.16e-07 1.48e-05 1.34e-06 7.81e-06 4.42e-06
Patients03 7.92e-06 0.00e+00 1.01e-05 2.60e-07 0.00e+00 2.45e-07 0.00e+00
Patients04 2.48e-07 0.00e+00 2.70e-07 0.00e+00 1.27e-06 0.00e+00 8.89e-08
Patients05 0.00e+00 6.15e-07 9.26e-07 0.00e+00 1.00e-05 2.44e-07 1.95e-06
Patients06 4.52e-06 1.73e-06 2.85e-06 1.61e-06 7.37e-06 2.81e-06 5.69e-06
K00029 K00030 K00031 K00032 K00033 K00034 K00035
Patients01 1.35e-06 5.17e-06 0.00e+00 2.37e-06 0.00e+00 0.00e+00 0.00e+00
Patients02 0.00e+00 1.67e-06 6.73e-06 0.00e+00 1.39e-06 7.30e-06 2.10e-06
Patients03 3.20e-06 1.47e-05 0.00e+00 1.13e-05 1.44e-07 0.00e+00 0.00e+00
Patients04 0.00e+00 3.29e-07 0.00e+00 0.00e+00 1.03e-05 0.00e+00 0.00e+00
Patients05 4.95e-07 2.16e-06 3.38e-07 1.37e-06 3.98e-06 7.01e-07 2.31e-07
Patients06 1.14e-06 6.32e-06 5.09e-06 1.79e-05 1.20e-05 1.66e-06 1.75e-06
K00036 K00037 K00038 K00039
Patients01 2.19e-07 2.39e-06 5.54e-07 0.00e+00
Patients02 0.00e+00 0.00e+00 0.00e+00 1.33e-05
Patients03 1.94e-06 4.90e-06 7.85e-06 0.00e+00
Patients04 2.62e-07 2.64e-07 7.76e-06 0.00e+00
Patients05 4.19e-07 0.00e+00 0.00e+00 2.13e-06
Patients06 2.45e-05 0.00e+00 1.69e-05 5.62e-06
# Correlations were calculated and data were processed with p>0.05 as the screening threshold
data<-as.matrix(df)
# Pairwise Spearman correlations with BH adjustment across all features
cor<- corr.test(data, method="spearman",adjust="BH")
data.cor <- as.data.frame(cor$r)
# Keep the full feature x feature matrices. The original hard-coded 1:159,
# which equals the current 120 species + 39 KOs; deriving the size from the
# matrix itself keeps this correct if the feature set changes.
n_feat <- ncol(cor$r)
r.cor <- data.frame(cor$r)[seq_len(n_feat), seq_len(n_feat)]
p.cor <- data.frame(cor$p)[seq_len(n_feat), seq_len(n_feat)]
# Zero out correlations that are not significant at p <= 0.05
r.cor[p.cor>0.05] <- 0
# r.cor[abs(r.cor) < 0.2] <- 0
# Keep values with |correlation|≥0.2 and p<0.05
# cor_sparcc_npc[abs(cor_sparcc_npc) < 0.2] <- 0
#
# pvals_npc[pvals_npc>=0.05] <- -1
# pvals_npc[pvals_npc<0.05 & pvals_npc>=0] <- 1
# pvals_npc[pvals_npc==-1] <- 0
# Filtered adjacency matrix
# adj_npc <- as.matrix(cor_sparcc_npc) * as.matrix(pvals_npc)
# diag(adj_npc) <- 0 #Convert the diagonal values of the correlation matrix (representing autocorrelations) to 0
# write.table(data.frame(adj_npc, check.names = FALSE), 'iNAP_results2/neetwork.adj_npc.txt', col.names = NA, sep = '\t', quote = FALSE)
# Constructing network connection properties and node properties
r.cor$from = rownames(r.cor)
p.cor$from = rownames(p.cor)
# Long-format p-values; pivot_longer() supersedes gather(). The downstream
# join is keyed on (from, to), so row order does not affect the analysis
# (only the row order of the exported CSV).
p_value <- p.cor %>%
  pivot_longer(-from, names_to = "to", values_to = "p") %>%
  data.frame()
# BH FDR across all pairwise p-values, then drop the raw p column
p_value$FDR <- p.adjust(p_value$p,"BH")
p_value <- p_value[, -3]
# Long-format r values joined with FDR; annotate edge colour/width
cor.data <- r.cor %>%
  pivot_longer(-from, names_to = "to", values_to = "r") %>%
  data.frame() %>%
  left_join(p_value, by=c("from","to")) %>%
  mutate(
    linecolor = ifelse(r > 0,"positive","negative"),
    linesize = abs(r)
  )
# Export the full (unfiltered) correlation table
write.csv(cor.data, "results/Correlation_network_analysis/Species_KO_Spearman_correlations.csv")
#cor.data <- cor.data[abs(cor.data$r)>0.2, ]
# Setting node properties
# Unique node names from both edge endpoints
# (group_by + empty summarise == one row per distinct value)
vertices <- c(as.character(cor.data$from),as.character(cor.data$to)) %>%
as_tibble() %>%
group_by(value) %>%
summarise()
colnames(vertices) <- "name"
# Attach the Species/KO classification for each node
vertices <- vertices %>%
left_join(group,by="name")
# Fix display order: Species first, then KO
vertices$group <- factor(vertices$group, levels = c("Species","KO" ))
# NOTE(review): arrange() resolves to plyr::arrange via the earlier
# conflicts_prefer(plyr::arrange) call — confirm this is the intended method.
vertices <- vertices %>%
arrange(group)
# Assemble the undirected graph from the edge table plus node attributes
graph <- graph_from_data_frame(cor.data, vertices = vertices, directed = FALSE)
# Edge weight = |r|; node label mirrors the node name
graph <- set_edge_attr(graph, "weight", value = abs(edge_attr(graph, "r")))
graph <- set_vertex_attr(graph, "label", value = vertex_attr(graph, "name"))
# Export as GraphML for Gephi / Cytoscape
write_graph(graph, "results/Correlation_network_analysis/Healthy_Spearman_network01.graphml", format="graphml")
# Visualize with Gephi or Cytoscape (基于Gephi软件或者Cytoscape软件进行可视化)
# Visualization based on Gephi software or Cytoscape software
## 1. data preparation
library(tidyverse)
library(igraph)
library(psych)
### 1.1 Observation-variable data table
data <- read.csv(
  "data/data.csv",
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  comment.char = ""
)
### 1.2 Variable classification table (node -> type)
type <- read.csv("data/type.csv", header = TRUE, check.names = FALSE)
## 2. Determine the correlation relationship
### 2.1 Calculate the correlation coefficient. It is recommended to use the spearman coefficient.
cor <- corr.test(data, use = "pairwise",method="spearman",adjust="holm", alpha=.05,ci=FALSE)
cor.r <- data.frame(cor$r)
cor.p <- data.frame(cor$p)
# Variable names contain special characters; re-sync column names with the
# row names so the matrices stay symmetric in their labels.
colnames(cor.r) = rownames(cor.r)
colnames(cor.p) = rownames(cor.p)
# Fix: write.csv() ignores col.names and emits a warning when it is supplied,
# so the argument is dropped; row names still carry the variable ids.
write.csv(cor.r,"results/Correlation_network_analysis/cor.r.csv",quote = FALSE,row.names = TRUE)
write.csv(cor.p,"results/Correlation_network_analysis/cor.p.csv",quote = FALSE,row.names = TRUE)
# Preview the first rows of the correlation matrix as a table
knitr::kable(head(cor.r), caption = "cor.r")
| pH | AK | AP | OM | NH4 | NO3 | Wc | Pb | Salt | Bacillus firmus | Bacillus megaterium | Bacillus thuringiensis | Bacillus subtilis | Bacillus subterraneus | Bacillus asahii | Bacillus pumilus | Candidimonas bauzanensis | Fictibacillus phosphorivorans | Aspergillus niger | Fusarium oxysporum | Chaetomium globosum | Gymnascella hyalinospora | Penicillium chrysogenum | Fusarium solani | Trichocladium asperum | Aspergillus ustus | Aspergillus terreus | Pseudallescheria boydii | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| pH | 1.0000000 | -0.6684011 | -0.4804946 | -0.2438234 | -0.1007803 | -0.7431736 | -0.0331600 | 0.2223669 | 0.2906374 | 0.2360210 | 0.4208132 | 0.1801592 | -0.5728676 | 0.2342806 | 0.1374839 | -0.3404478 | -0.5884099 | -0.1703104 | 0.1398375 | 0.1720326 | -0.2055286 | 0.3183329 | -0.3480111 | -0.2995723 | -0.0162813 | -0.3694945 | 0.0717738 | -0.3297533 |
| AK | -0.6684011 | 1.0000000 | 0.7623377 | 0.7324675 | 0.2948052 | 0.4610390 | 0.1389610 | -0.2415584 | -0.1246753 | -0.3233766 | -0.4215655 | -0.6297376 | 0.6154758 | -0.0825003 | -0.3579862 | 0.4074773 | 0.5802067 | -0.2235270 | -0.5293927 | -0.3254304 | 0.4780773 | -0.6043092 | 0.5231298 | -0.0520899 | -0.0299187 | 0.3835584 | 0.4179111 | 0.7397974 |
| AP | -0.4804946 | 0.7623377 | 1.0000000 | 0.7207792 | 0.2025974 | 0.2194805 | 0.1194805 | 0.0935065 | 0.2103896 | 0.0259740 | -0.1292628 | -0.7699733 | 0.4111930 | 0.2704926 | -0.2560826 | 0.1296519 | 0.4747146 | -0.1055725 | -0.2936018 | -0.1740825 | 0.5644690 | -0.6752499 | 0.3765492 | -0.1656192 | -0.1372360 | 0.4106172 | 0.3124190 | 0.6031986 |
| OM | -0.2438234 | 0.7324675 | 0.7207792 | 1.0000000 | 0.4441558 | 0.1688312 | 0.4649351 | -0.1311688 | 0.0441558 | -0.1415584 | -0.1656382 | -0.8030478 | 0.4249870 | 0.2055744 | -0.3354880 | 0.2659186 | 0.3097141 | -0.4359754 | -0.4923677 | -0.3715492 | 0.5508282 | -0.6831322 | 0.2801318 | -0.2457575 | 0.0637399 | 0.2130880 | 0.5004114 | 0.6613545 |
| NH4 | -0.1007803 | 0.2948052 | 0.2025974 | 0.4441558 | 1.0000000 | 0.0077922 | 0.9038961 | -0.4246753 | 0.0064935 | -0.2935065 | -0.2695681 | -0.3267756 | 0.3317132 | 0.3015993 | -0.3818079 | 0.2169684 | 0.2569680 | -0.2587179 | -0.1994154 | -0.4767782 | 0.1721338 | -0.3652130 | 0.2123790 | -0.0213702 | 0.3700818 | 0.3077938 | 0.2840173 | 0.2515582 |
| NO3 | -0.7431736 | 0.4610390 | 0.2194805 | 0.1688312 | 0.0077922 | 1.0000000 | 0.0480519 | -0.5220779 | -0.5142857 | -0.0649351 | -0.2312439 | -0.1203910 | 0.6404364 | -0.1487710 | 0.0119108 | 0.1997697 | 0.2907796 | 0.2091900 | -0.3104904 | -0.3760962 | 0.3683014 | -0.1931162 | 0.1302939 | 0.3940134 | 0.1333335 | 0.0148823 | -0.2312712 | 0.2569680 |
#head(cor.p)
## 2.2 Determine the correlation relationship
# Keep the correlation between variables with p<=0.05 and abs(r)>=0.6
# cor.r[abs(cor.r) < 0.6 | cor.p > 0.05] = 0
# cor.r = as.matrix(cor.r)
# g = graph_from_adjacency_matrix(cor.r,mode = "undirected",weighted = TRUE,diag = FALSE)
## Convert the data to long format for filtering. Node and link data are needed to draw the network diagram later.
cor.r$node1 = rownames(cor.r)
cor.p$node1 = rownames(cor.p)
# pivot_longer() supersedes gather(). Row order is irrelevant here because
# merge() below re-sorts by the (node1, node2) keys anyway.
r = cor.r %>%
  pivot_longer(-node1, names_to = "node2", values_to = "r") %>%
  data.frame()
p = cor.p %>%
  pivot_longer(-node1, names_to = "node2", values_to = "p") %>%
  data.frame()
#head(r)
#head(p)
## Combine r and p values into one data table
cor.data <- merge(r,p,by=c("node1","node2"))
#head(cor.data)
## Keep the correlation between variables with p<=0.05 and abs(r)>=0.6, and add network attributes
cor.data <- cor.data %>%
  filter(abs(r) >= 0.6, p <= 0.05, node1 != node2) %>%
  mutate(
    linetype = ifelse(r > 0,"positive","negative"),
    linesize = abs(r)
  )
#head(cor.data)
## 3. Constructing the network graph data structure
# After building the network in this step, you also need to convert the network graph into a simple graph and remove duplicate links.
### 3.1 Network graph node attribute arrangement
#### Calculate the number of links each node has
conflicts_prefer(dplyr::summarize)
c(as.character(cor.data$node1),as.character(cor.data$node2)) %>%
  as_tibble() %>%
  group_by(value) %>%
  summarize(n=n()) -> vertices
colnames(vertices) <- c("node", "n")
#head(vertices)
#### Add variable classification attributes
vertices %>%
  select(-n) %>%
  left_join(type,by="node") -> vertices
#### Nodes are drawn in file order; sort by type so same-type variables sit together
vertices$type = factor(vertices$type,levels = unique(vertices$type))
vertices = arrange(vertices,type)
# Fix: write.csv() ignores col.names and warns when it is supplied — drop it
write.csv(vertices,"results/Correlation_network_analysis/vertices.csv",quote = FALSE,row.names = FALSE)
head(vertices)
# A tibble: 6 × 2
node type
<chr> <fct>
1 AK Soil physi-chemical factors
2 AP Soil physi-chemical factors
3 NH4 Soil physi-chemical factors
4 NO3 Soil physi-chemical factors
5 OM Soil physi-chemical factors
6 Wc Soil physi-chemical factors
### 3.2 Build graph structure data
g <- graph_from_data_frame(cor.data, vertices = vertices, directed = FALSE )
#g
# Node and edge counts before simplification
vcount(g)
[1] 21
ecount(g)
[1] 42
#get.vertex.attribute(g) # View the node attributes contained in the graph
#get.edge.attribute(g) # View the link properties contained in the graph
## 3.3 Simple Graph
is.simple(g) # For non-simple graphs, the number of links will be too high, so it needs to be converted to a simple graph
[1] FALSE
# Unit edge weights so simplify() has a weight attribute to combine
E(g)$weight <- 1
# Collapse duplicate edges and remove self-loops, keeping the first
# occurrence of every edge attribute
g <- igraph::simplify(g,
remove.multiple = TRUE,
remove.loops = TRUE,
edge.attr.comb = "first")
# g <- delete.vertices(g,which(degree(g) == 0)) # Delete isolated points
is.simple(g)
[1] TRUE
# Re-assign unit weights after simplification
E(g)$weight <- 1
is.weighted(g)
[1] TRUE
# Counts after simplification: duplicate links removed (42 -> 33 edges)
vcount(g)
[1] 21
ecount(g)
[1] 33
### 3.4 Calculating the number of node links
conflicts_prefer(igraph::degree)
# Node degree drives node size in the later plot
V(g)$degree <- degree(g)
#vertex.attributes(g)
#edge.attributes(g)
#g
### 3.5 Save the graph locally
# Directly save the graph structure. GML can save the most graph information.
write.graph(g,file = "results/Correlation_network_analysis/all.gml",format="gml")
net.data <- igraph::as_data_frame(g, what = "both")$edges # Extract link properties
# Fix: write.csv() ignores col.names and warns when it is supplied — drop it
write.csv(net.data,"results/Correlation_network_analysis/net.data.csv",quote = FALSE,row.names = FALSE)
head(net.data)
from to r p linetype linesize
1 AK AP 0.7623377 0.0220369496 positive 0.7623377
2 AK OM 0.7324675 0.0001595923 positive 0.7324675
3 AK pH -0.6684011 0.0009260728 negative 0.6684011
4 AK Gymnascella hyalinospora -0.6043092 0.0037140827 negative 0.6043092
5 AK Pseudallescheria boydii 0.7397974 0.0469368082 positive 0.7397974
6 AK Bacillus subtilis 0.6154758 0.0029781355 positive 0.6154758
weight
1 1
2 1
3 1
4 1
5 1
6 1
vertices <- igraph::as_data_frame(g, what = "both")$vertices # Extracting node attributes
# Fix: write.csv() ignores col.names and warns when it is supplied — drop it
write.csv(vertices,"results/Correlation_network_analysis/vertices.csv",quote = FALSE,row.names = FALSE)
# head(vertices)
# Directly read previously saved link and node attribute files, and then directly generate graphs or use them for drawing in other drawing software.
## 4. Draw a group network diagram
### 4.1 Preparing network diagram layout data
#?layout_in_circle
# Candidate layouts; layout1 (circle) is the one used by the plot below
layout1 <- layout_in_circle(g)
layout2 <- layout_with_fr(g)
layout3 <- layout_on_grid(g)
layout4 <- layout_nicely(g)
layout5 <- layout_with_graphopt(g)
#head(layout1)
### 4.2 Setting the drawing color
#?rgb()
### Set the background color of nodes and groups
# color <- c(rgb(65,179,194,maxColorValue = 255),
# rgb(255,255,0,maxColorValue = 255),
# rgb(201,216,197,maxColorValue = 255))
color <- c("#5ebcc2","#ce77ad","#879b56")
names(color) <- unique(V(g)$type) # Name the color based on the node classification attribute
V(g)$point.col <- color[match(V(g)$type,names(color))] # Set node color
# names(color2) <- unique(V(g)$type)
# If you want the node color to be different from the background color, you can set a separate color set for the node.
# V(g)$point.col <- color2[match(V(g)$type,names(color2))]
#### The edge color is set according to the positive or negative correlation
# E(g)$color <- ifelse(E(g)$linetype == "positive",rgb(255,215,0,maxColorValue = 255),"gray50")
E(g)$color <- ifelse(E(g)$linetype == "positive","red",rgb(0,147,0,maxColorValue = 255))
# E(g)$color = ifelse(E(g)$r>0,rgb(254/255,67/255,101/255,abs(E(g)$r)),rgb(0/255,0/255,255/255,abs(E(g)$r)))
### 4.3 Draw a radial (circle) layout network diagram
pdf("results/Correlation_network_analysis/network_group_circle.pdf", family = "Times", width = 10, height = 12)
par(mar = c(5, 2, 1, 2))
plot.igraph(g, layout = layout1,
  ## Node colour parameters
  vertex.color = V(g)$point.col,
  vertex.frame.color = "black",
  ## Node size scaled by degree
  vertex.size = V(g)$degree * 2,
  rescale = TRUE,
  ## Node labels.
  ## Fix: labels must come from the vertex attribute V(g)$name (g$name is a
  ## graph-level attribute and is NULL here), and the igraph parameter for
  ## label colour is vertex.label.color (vertex.label.col was ignored).
  ## The invalid "vertex.border" and bare "shape" arguments were dropped.
  vertex.label = V(g)$name,
  vertex.label.cex = 0.8,
  ## Distance between the label and the node centre (0 = centred)
  vertex.label.dist = 0,
  vertex.label.color = "black",
  ## Group backgrounds: one vertex-name list per node type
  mark.groups = list(V(g)$name[V(g)$type %in% names(color)[1]],
                     V(g)$name[V(g)$type %in% names(color)[2]],
                     V(g)$name[V(g)$type %in% names(color)[3]]),
  mark.col = color,
  mark.border = color,
  ## Link (edge) parameters
  edge.arrow.size = 0.5,
  edge.width = abs(E(g)$r) * 2,
  edge.curved = TRUE
)
## Legends: coordinates are tuned for this layout and may need adjusting
legend(
  title = "Type",
  list(x = min(layout1[, 1]) - 0.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(unique(V(g)$type)),
  fill = color
)
legend(
  title = "|r-value|",
  list(x = min(layout1[, 1]) + 0.4,
       y = min(layout1[, 2]) - 0.17),
  legend = c(0.6, 0.8, 1.0),
  col = "black",
  lty = 1,
  ## Fix: removed the trailing comma after lwd, which created an empty
  ## argument and an "argument is empty" error.
  lwd = c(0.6, 0.8, 1.0) * 2
)
legend(
  title = "Correlation (±)",
  list(x = min(layout1[, 1]) + 0.8,
       y = min(layout1[, 2]) - 0.17),
  legend = c("positive", "negative"),
  col = c("red", rgb(0, 147, 0, maxColorValue = 255)),
  lty = 1,
  lwd = 1
)
legend(
  title = "Degree",
  list(x = min(layout1[, 1]) + 1.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(1, seq(0, 8, 2)[-1]),
  col = "black",
  pch = 1,
  pt.lwd = 1,
  pt.cex = c(1, seq(0, 8, 2)[-1])
)
dev.off()
png
2
### 4.4 Draw fr layout network diagram - no background colour
pdf("results/Correlation_network_analysis/network_group_fr.pdf", family = "Times", width = 10, height = 12)
par(mar = c(5, 2, 1, 2))
plot.igraph(g, layout = layout2,
  vertex.color = V(g)$point.col,
  vertex.frame.color = "black",
  vertex.size = V(g)$degree * 2,
  ## Fix: labels come from V(g)$name (g$name is NULL) and the label-colour
  ## parameter is vertex.label.color; the invalid "vertex.border" was dropped.
  vertex.label = V(g)$name,
  vertex.label.cex = 0.8,
  vertex.label.color = "black",
  edge.arrow.size = 0.5,
  edge.width = abs(E(g)$r) * 2,
  edge.curved = TRUE
)
## NOTE(review): legend positions reuse layout1 coordinates although the plot
## uses layout2 - confirm the placement is intentional.
legend(
  title = "Type",
  list(x = min(layout1[, 1]) - 0.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(unique(V(g)$type)),
  fill = color
)
legend(
  title = "|r-value|",
  list(x = min(layout1[, 1]) + 0.4,
       y = min(layout1[, 2]) - 0.17),
  legend = c(0.6, 0.8, 1.0),
  col = "black",
  lty = 1,
  ## Fix: removed the trailing comma after lwd (empty-argument error).
  lwd = c(0.6, 0.8, 1.0) * 2
)
legend(
  title = "Correlation (±)",
  list(x = min(layout1[, 1]) + 0.8,
       y = min(layout1[, 2]) - 0.17),
  legend = c("positive", "negative"),
  col = c("red", rgb(0, 147, 0, maxColorValue = 255)),
  lty = 1,
  lwd = 1
)
legend(
  title = "Degree",
  list(x = min(layout1[, 1]) + 1.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(1, seq(0, 8, 2)[-1]),
  col = "black",
  pch = 1,
  pt.lwd = 1,
  pt.cex = c(1, seq(0, 8, 2)[-1])
)
dev.off()
png
2
### 4.5 Draw a graphopt layout network diagram - without background colour
pdf("results/Correlation_network_analysis/network_group_graphopt.pdf", family = "Times", width = 10, height = 12)
par(mar = c(5, 2, 1, 2))
plot.igraph(g, layout = layout5,
  vertex.color = V(g)$point.col,
  ## Fixed node size in this variant (not scaled by degree)
  vertex.size = 6,
  vertex.frame.color = "white",
  ## Fix: labels come from V(g)$name; label colour parameter is
  ## vertex.label.color; the invalid "vertex.border" and the duplicated
  ## vertex.frame.color were dropped.
  vertex.label = V(g)$name,
  vertex.label.cex = 0.8,
  vertex.label.dist = 0,
  vertex.label.degree = pi/2,
  vertex.label.color = "black",
  edge.arrow.size = 0.5,
  ## Fix: removed the trailing comma after edge.width, which left an empty
  ## argument before ")" and caused an "argument is empty" error.
  edge.width = abs(E(g)$r) * 6
)
legend(
  title = "Type",
  list(x = min(layout1[, 1]) - 0.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(unique(V(g)$type)),
  fill = color
)
legend(
  title = "|r-value|",
  list(x = min(layout1[, 1]) + 0.4,
       y = min(layout1[, 2]) - 0.17),
  legend = c(0.6, 0.8, 1.0),
  col = "black",
  lty = 1,
  ## Fix: removed the trailing comma after lwd (empty-argument error).
  lwd = c(0.6, 0.8, 1.0) * 2
)
legend(
  title = "Correlation (±)",
  list(x = min(layout1[, 1]) + 0.8,
       y = min(layout1[, 2]) - 0.17),
  legend = c("positive", "negative"),
  col = c("red", rgb(0, 147, 0, maxColorValue = 255)),
  lty = 1,
  lwd = 1
)
legend(
  title = "Degree",
  list(x = min(layout1[, 1]) + 1.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(1, seq(0, 8, 2)[-1]),
  col = "black",
  pch = 1,
  pt.lwd = 1,
  pt.cex = c(1, seq(0, 8, 2)[-1])
)
dev.off()
png
2
### 4.6 Edge colours as an alpha gradient: the stronger the correlation, the darker (more opaque) the edge.
#?layout_in_circle
layout1 <- layout_in_circle(g)
layout2 <- layout_with_fr(g)
layout3 <- layout_on_grid(g)
layout4 <- layout_nicely(g)
layout5 <- layout_with_graphopt(g)
#head(layout1)
#?rgb()
# Re-apply node colours by type (same palette as above)
color <- c("#5ebcc2","#ce77ad","#879b56")
names(color) <- unique(V(g)$type)
V(g)$point.col <- color[match(V(g)$type,names(color))]
#names(color2) <- unique(V(g)$type)
#V(g)$point.col <- color2[match(V(g)$type,names(color2))]
#E(g)$color <- ifelse(E(g)$linetype == "positive",rgb(255,215,0,maxColorValue = 255),"gray50")
#E(g)$color <- ifelse(E(g)$linetype == "positive","red",rgb(0,147,0,maxColorValue = 255))
# Red for positive, blue for negative; |r| sets the alpha channel
E(g)$color = ifelse(E(g)$r>0,rgb(254/255,67/255,101/255,abs(E(g)$r)),rgb(0/255,0/255,255/255,abs(E(g)$r)))
# Graphopt layout with gradient edge colours
pdf("results/Correlation_network_analysis/network_group_graphopt3.pdf", family = "Times", width = 10, height = 12)
par(mar = c(5, 2, 1, 2))
plot.igraph(g, layout = layout5,
  vertex.color = V(g)$point.col,
  vertex.size = 6,
  vertex.frame.color = "white",
  ## Fix: labels come from V(g)$name; label colour parameter is
  ## vertex.label.color; invalid "vertex.border" dropped.
  vertex.label = V(g)$name,
  vertex.label.cex = 0.8,
  vertex.label.dist = 0,
  vertex.label.degree = pi/2,
  vertex.label.color = "black",
  edge.arrow.size = 0.5,
  ## Fix: removed the trailing comma after edge.width (empty-argument error).
  edge.width = abs(E(g)$r) * 6
)
legend(
  title = "Type",
  list(x = min(layout1[, 1]) - 0.2,
       y = min(layout1[, 2]) - 0.17),
  legend = c(unique(V(g)$type)),
  fill = color
)
dev.off()
png
2
# Stand-alone colour-bar legend for the gradient edges: r runs from 1 (red,
# fully opaque) through 0 (transparent) to -1 (blue, fully opaque).
pdf("results/Correlation_network_analysis/network_group_graphopt3_legend.pdf",family = "Times",width = 10,height = 3)
color_legend = c(rgb(254/255,67/255,101/255,seq(1,0,by=-0.01)),rgb(0/255,0/255,255/255,seq(0,1,by=0.01)))
par(mar=c(2,2,1,2),xpd = T,cex.axis=1.6,las=1)
# One unit-height bar per colour step, no gaps, drawn as a continuous ramp
barplot(rep(1,length(color_legend)),border = NA, space = 0,ylab="",xlab="",xlim=c(1,length(color_legend)),horiz=FALSE,
axes = F, col=color_legend,main="")
# Tick labels across the top: 1 .. -1
axis(3,at=seq(1,length(color_legend),length=5),c(1,0.5,0,-0.5,-1),tick=FALSE)
dev.off()
png
2
# Reference: https://mp.weixin.qq.com/s/qJHibDjtbEqpqgsfrYdETw
# install.packages("BiocManager")
# library(BiocManager)
# install("remotes")
# install("tidyverse")
library(tidyverse)
# install("tidyfst")
library(tidyfst)
# install("igraph")
library(igraph)
# install("sna")
library(sna)
# install("phyloseq")
library(phyloseq)
# install("ggalluvial")
library(ggalluvial)
# install("ggraph")
library(ggraph)
# install("WGCNA")
library(WGCNA)
# install("ggnewscale")
library(ggnewscale)
# install("pulsar")
library(pulsar)
# install("patchwork")
library(patchwork)
# remotes::install_github("taowenmicro/EasyStat")
library(EasyStat)
# remotes::install_github("taowenmicro/ggClusterNet")
library(ggClusterNet)
library(phyloseq)
library(tidyverse)
library(igraph)
library(tidyfst)
#netpath = paste(otupath,"/network_stab/",sep = "")
#dir.create(netpath)
# Reading the raw files (metadata, OTU table, taxonomy, tree, sequences)
metadata = read.delim("data/data.ps/metadata.tsv")
row.names(metadata) = metadata$SampleID
otutab = read.table("data/data.ps/otutab.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
taxonomy = read.table("data/data.ps/taxonomy.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
library(ggtree)
# Phylogenetic tree of the OTUs
tree = read.tree("data/data.ps/otus.tree")
library(Biostrings)
# Representative sequences
rep = readDNAStringSet("data/data.ps/otus.fa")
# Assemble everything into a single phyloseq object
library(phyloseq)
ps = phyloseq(sample_data(metadata),
otu_table(as.matrix(otutab), taxa_are_rows=TRUE),
tax_table(as.matrix(taxonomy)),
phy_tree(tree),
refseq(rep)
)
ps
phyloseq-class experiment-level object
otu_table() OTU Table: [ 2432 taxa and 18 samples ]
sample_data() Sample Data: [ 18 samples by 11 sample variables ]
tax_table() Taxonomy Table: [ 2432 taxa by 7 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 2432 tips and 2431 internal nodes ]
refseq() DNAStringSet: [ 2432 reference sequences ]
# Main function, used for network calculation and visualization
library(ggClusterNet)
library(phyloseq)
library(tidyverse)
# NOTE(review): data(ps) loads ggClusterNet's bundled example `ps`, replacing
# the phyloseq object just assembled from files above - confirm which dataset
# the downstream network analysis is meant to use.
data(ps)
otupath = "./"
# Set work directory
netpath = paste(otupath,"results/Correlation_network_analysis/network.new/",sep = "")
dir.create(netpath)
rank.names(ps)
[1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
library(ggrepel)
library(igraph)
#detach("package:MicrobiotaProcess")
# Network analysis main function: builds one co-occurrence network per Group
# and returns plots plus correlation matrices (see ggClusterNet::network.pip).
tab.r = network.pip(
ps = ps,
N = 400, # number of taxa retained - presumably the most abundant; confirm in docs
# ra = 0.05,
big = TRUE, # large-network mode
select_layout = FALSE,
layout_net = "model_maptree2",
r.threshold = 0.6, # |correlation| cutoff for keeping an edge
p.threshold = 0.05, # p-value cutoff for keeping an edge
maxnode = 2,
method = "spearman", # correlation method
label = TRUE,
lab = "elements",
group = "Group", # metadata column: one network per level
fill = "Phylum",
size = "igraph.degree",
zipi = TRUE, # also compute ZiPi topological-role statistics
ram.net = TRUE,
clu_method = "cluster_fast_greedy", # module-detection method
step = 100,
R=10,
ncpus = 1
)
[1] "KO"
[1] "cor matrix culculating over"
[1] "OE"
[1] "cor matrix culculating over"
[1] "WT"
[1] "cor matrix culculating over"
# It is recommended to save the output results as R objects to save time by not performing correlation matrix calculations later.
saveRDS(tab.r,paste0(netpath,"network.pip.sparcc.rds"))
tab.r = readRDS(paste0(netpath,"network.pip.sparcc.rds"))
dat = tab.r[[2]]
cortab = dat$net.cor.matrix$cortab
# It is not easy to run a large correlation matrix, so it is recommended to save it to facilitate the calculation of various network properties.
saveRDS(cortab,paste0(netpath,"cor.matrix.all.group.rds"))
cor = readRDS(paste0(netpath,"cor.matrix.all.group.rds"))
# Extract the storage objects of all images
plot = tab.r[[1]]
# Extract network graph visualization results
p0 = plot[[1]]
ggsave(paste0(netpath,"plot.network.pdf"),p0,width = 12,height = 5)
ggsave(paste0(netpath,"plot.network2.pdf"),p0,width = 16,height = 10)
# zipi display
plot[[2]]
# Comparison with random networks
plot[[3]]
## Network attribute calculation - rich network attributes.
## One column of net_properties.4() output per group, assembled in one step.
## Fix: the original grew `dat2` with cbind inside a loop and carried a dead
## head() call whose result was discarded.
id = names(cor)
dat2 = do.call(cbind, lapply(id, function(group.id) {
  igraph = cor[[group.id]] %>% make_igraph()
  dat = net_properties.4(igraph, n.hub = F)
  colnames(dat) = group.id
  dat
}))
head(dat2)
KO OE
num.edges(L) "348" "261"
num.pos.edges "204" "174"
num.neg.edges "144" "87"
num.vertices(n) "279" "273"
Connectance(edge_density) "0.00897346638817978" "0.00702973497091144"
average.degree(Average K) "2.49462365591398" "1.91208791208791"
WT
num.edges(L) "258"
num.pos.edges "151"
num.neg.edges "107"
num.vertices(n) "278"
Connectance(edge_density) "0.00670077656286523"
average.degree(Average K) "1.85611510791367"
FileName <- paste(netpath,"net.network.attribute.data.csv", sep = "")
write.csv(dat2,FileName,quote = F)
## Node attribute calculation: per-group node properties, columns suffixed
## with the group id, joined on the ASV name.
## Fix: replaces the grow-in-loop full_join with lapply + Reduce.
nodepro.list = lapply(id, function(group.id) {
  igraph = cor[[group.id]] %>% make_igraph()
  nodepro = node_properties(igraph) %>% as.data.frame()
  nodepro$Group = group.id
  colnames(nodepro) = paste0(colnames(nodepro), ".", group.id)
  nodepro %>%
    as.data.frame() %>%
    rownames_to_column("ASV.name")
})
nodepro2 = Reduce(function(left, right) full_join(left, right, by = "ASV.name"), nodepro.list)
head(nodepro2)
ASV.name igraph.degree.KO igraph.closeness.KO igraph.betweenness.KO
1 ASV_180 2 1 0
2 ASV_378 2 1 0
3 ASV_20 4 1 0
4 ASV_193 4 1 0
5 ASV_309 4 1 0
6 ASV_417 4 1 0
igraph.cen.degree.KO Group.KO igraph.degree.OE igraph.closeness.OE
1 2 KO 3 1
2 2 KO NA NA
3 4 KO NA NA
4 4 KO NA NA
5 4 KO NA NA
6 4 KO 1 1
igraph.betweenness.OE igraph.cen.degree.OE Group.OE igraph.degree.WT
1 0 3 OE 1
2 NA NA <NA> NA
3 NA NA <NA> 4
4 NA NA <NA> 2
5 NA NA <NA> 2
6 0 1 OE NA
igraph.closeness.WT igraph.betweenness.WT igraph.cen.degree.WT Group.WT
1 1 0 1 WT
2 NA NA NA <NA>
3 1 0 4 WT
4 1 0 2 WT
5 1 0 2 WT
6 NA NA NA <NA>
FileName <- paste(netpath,"net.node.attribute.data.sample.csv", sep = "")
write_csv(nodepro2,FileName)
## Customizable network output
dat = tab.r[[2]]
node = dat$net.cor.matrix$node
edge = dat$net.cor.matrix$edge
head(edge)
# A tibble: 6 × 12
X2 Y2 OTU_2 OTU_1 weight X1 Y1 cor group Group nodes label
<dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <fct>
1 -37.9 -3.41 ASV_126 ASV_431 -1 -35.8 -4.83 - WT WT 278 WT: (n…
2 -37.9 -3.41 ASV_126 ASV_330 1 -38.0 -5.34 + WT WT 278 WT: (n…
3 -37.9 -3.41 ASV_126 ASV_380 1 -36.3 -2.93 + WT WT 278 WT: (n…
4 -35.8 -4.83 ASV_431 ASV_330 -1 -38.0 -5.34 - WT WT 278 WT: (n…
5 -35.8 -4.83 ASV_431 ASV_380 -1 -36.3 -2.93 - WT WT 278 WT: (n…
6 -38.0 -5.34 ASV_330 ASV_380 1 -36.3 -2.93 + WT WT 278 WT: (n…
head(node)
X1 X2 elements igraph.degree igraph.closeness
1 10.907726 -9.455050 ASV_1 4 1
2 6.602809 -18.194949 ASV_10 2 1
3 -28.228285 5.342999 ASV_100 0 0
4 3.737447 -29.250730 ASV_101 1 1
5 -27.083427 3.213331 ASV_102 1 1
6 -23.166028 5.072746 ASV_103 1 1
igraph.betweenness igraph.cen.degree group ID Kingdom Phylum
1 0 4 KO ASV_1 Bacteria Actinobacteria
2 0 2 KO ASV_10 Bacteria Proteobacteria
3 0 0 KO ASV_100 Bacteria Proteobacteria
4 0 1 KO ASV_101 Bacteria Proteobacteria
5 0 1 KO ASV_102 Bacteria Proteobacteria
6 0 1 KO ASV_103 Bacteria Proteobacteria
Class Order Family Genus
1 Actinobacteria Actinomycetales Thermomonosporaceae Unassigned
2 Alphaproteobacteria Rhizobiales Rhizobiaceae Rhizobium
3 Betaproteobacteria Burkholderiales Comamonadaceae Hydrogenophaga
4 Unassigned Unassigned Unassigned Unassigned
5 Gammaproteobacteria Unassigned Unassigned Unassigned
6 Alphaproteobacteria Rhizobiales Phyllobacteriaceae Phyllobacterium
Species Group nodes label
1 Unassigned KO 279 KO: (nodes: 279; links: )
2 Unassigned KO 279 KO: (nodes: 279; links: )
3 Hydrogenophaga_intermedia KO 279 KO: (nodes: 279; links: )
4 Unassigned KO 279 KO: (nodes: 279; links: )
5 Unassigned KO 279 KO: (nodes: 279; links: )
6 Phyllobacterium_bourgognense KO 279 KO: (nodes: 279; links: )
#node2 = add.id.facet(node,"Group")
#head(node2)
# Custom network figure: edges drawn as segments between layout coordinates,
# nodes as filled points at their (X1, X2) positions, one facet per group.
p <- ggplot() + geom_segment(aes(x = X1, y = Y1, xend = X2, yend = Y2,color = cor),
data = edge, size = 0.03,alpha = 0.1) +
geom_point(aes(X1, X2,
fill = Phylum,
size = igraph.degree),
pch = 21, data = node,color = "gray40") +
facet_wrap(.~ label,scales="free_y",nrow = 1) +
# geom_text_repel(aes(X1, X2,label = elements),pch = 21, data = nodeG) +
# geom_text(aes(X1, X2,label = elements),pch = 21, data = nodeG) +
# Two edge colours: one for "+" and one for "-" correlations
scale_colour_manual(values = c("#6D98B5","#D48852")) +
# scale_fill_hue()+
scale_size(range = c(0.8, 5)) +
# Hide axes - coordinates are arbitrary layout positions
scale_x_continuous(breaks = NULL) +
scale_y_continuous(breaks = NULL) +
theme(panel.background = element_blank(),
plot.title = element_text(hjust = 0.5)
) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank()
) +
theme(legend.background = element_rect(colour = NA)) +
theme(panel.background = element_rect(fill = "white", colour = NA)) +
theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank())
p
# Zipi Visualization-Customization
dat.z = dat$zipi.data
head(dat.z)
z module p roles label role_7 group id
KO.ASV_180 0 42 0 Peripherals ultra peripheral KO ASV_180
KO.ASV_378 0 42 0 Peripherals ultra peripheral KO ASV_378
KO.ASV_2 0 42 0 Peripherals ultra peripheral KO ASV_2
KO.ASV_20 0 10 0 Peripherals ultra peripheral KO ASV_20
KO.ASV_193 0 10 0 Peripherals ultra peripheral KO ASV_193
KO.ASV_309 0 10 0 Peripherals ultra peripheral KO ASV_309
## Quadrant boundaries for the z-P (within-module connectivity vs
## participation coefficient) topological-role plot
x1 <- c(0, 0.62, 0, 0.62)
x2 <- c(0.62, 1, 0.62, 1)
y1 <- c(-Inf, 2.5, 2.5, -Inf)
y2 <- c(2.5, Inf, Inf, 2.5)
lab <- c("peripheral", 'Network hubs', 'Module hubs', 'Connectors')
roles.colors <- c("#E6E6FA", "#DCDCDC", "#F5FFFA", "#FAEBD7")
tab = data.frame(x1 = x1, y1 = y1, x2 = x2, y2 = y2, lab = lab)
# One copy of the rectangle table per group facet.
# Fix: replicate in a single step instead of growing tab2 with rbind in a loop.
tem = dat.z$group %>% unique() %>% length()
tab2 = do.call(rbind, replicate(tem, tab, simplify = FALSE))
# ZiPi plot: role-region rectangles with one point per node, faceted by group
p <- ggplot() +
  geom_rect(data=tab2,
            mapping=aes(xmin=x1,
                        xmax=x2,
                        ymin=y1,
                        ymax=y2,
                        fill = lab))+
  guides(fill=guide_legend(title="Topological roles")) +
  scale_fill_manual(values = roles.colors)+
  geom_point(data=dat.z,aes(x=p, y=z,color=module)) + theme_bw()+
  # Fix: logical values for guides() are deprecated (ggplot2 >= 3.3.4);
  # use "none" to drop the colour guide.
  guides(color = "none") +
  ggrepel::geom_text_repel(data = dat.z,
                           aes(x = p, y = z,
                               color = module,label=label),size=4)+
  # facet_wrap(.~group) +
  # Fix: spell out `scales` (the original relied on partial matching of `scale`).
  facet_grid(.~ group, scales = 'free') +
  theme(strip.background = element_rect(fill = "white"))+
  xlab("Participation Coefficient")+ylab(" Within-module connectivity z-score")
p
# Random networks, power-law distribution
dat.r = dat$random.net.data
p3 <- ggplot(dat.r) +
geom_point(aes(x = ID,y = network,
group =group,fill = group),pch = 21,size = 2) +
geom_smooth(aes(x = ID,y = network,group =group,color = group))+
facet_grid(.~g,scales = "free") +
theme_bw() + theme(
plot.margin=unit(c(0,0,0,0), "cm")
)
p3
# Multi-network comparison-network saliency
dat = module.compare.net.pip(
ps = NULL,
corg = cor,
degree = TRUE,
zipi = FALSE,
r.threshold= 0.8,
p.threshold=0.05,
method = "spearman",
padj = F,
n = 3)
res = dat[[1]]
head(res)
module1 module2 Both P1A2 P2A1 A1A2 p_raw p_adj
Var1 KO OE 165 114 108 59 0.8965031 0.8965031
Var11 KO WT 160 119 118 49 0.9983120 0.9983120
Var12 OE WT 174 99 104 69 0.2515650 0.2515650
FileName <- paste(netpath,"net.compare.diff.sig.csv", sep = "")
write.csv(res,FileName,quote = F)
## Network stability-module comparison
library(tidyfst)
res1 = module.compare.m(
ps = NULL,
corg = cor,
zipi = FALSE,
zoom = 0.2,
padj = F,
n = 3)
[1] 36 8
[1] 67 8
[1] 86 8
[1] 0
[1] 1
[1] 2
[1] 3
[1] 0
[1] 1
[1] 2
[1] 3
[1] 4
[1] 0
[1] 1
[1] 2
[1] 3
[1] 4
# Different groups are shown using a circle. A dot in the circle represents a
# module. Connected modules represent similar modules.
# Fix: the assignment below had been swallowed into the comment above,
# leaving p1 undefined when printed.
p1 = res1[[1]]
p1
# Extract the corresponding information of OTU, grouping, etc. of the module
dat1 = res1[[2]]
head(dat1)
ID group Group
1 ASV_62 KOmodel_1 KO
2 ASV_55 KOmodel_1 KO
3 ASV_122 KOmodel_1 KO
4 ASV_224 KOmodel_1 KO
5 ASV_248 KOmodel_1 KO
6 ASV_395 KOmodel_1 KO
# Module similarity result table
dat2 = res1[[3]]
head(dat2)
module1 module2 Both P1A2 P2A1 A1A2
KOmodel_32-OEmodel_4 KOmodel_32 OEmodel_4 1 2 4 380
KOmodel_40-OEmodel_4 KOmodel_40 OEmodel_4 1 2 4 380
KOmodel_1-OEmodel_12 KOmodel_1 OEmodel_12 2 5 2 378
KOmodel_19-OEmodel_9 KOmodel_19 OEmodel_9 1 3 3 380
KOmodel_25-OEmodel_8 KOmodel_25 OEmodel_8 1 3 3 380
KOmodel_28-OEmodel_8 KOmodel_28 OEmodel_8 1 2 3 381
p_raw p_adj
KOmodel_32-OEmodel_4 0.0383590783638981 0.0383590783638981
KOmodel_40-OEmodel_4 0.0383590783638981 0.0383590783638981
KOmodel_1-OEmodel_12 0.00165785308197045 0.00165785308197045
KOmodel_19-OEmodel_9 0.0408633515973255 0.0408633515973255
KOmodel_25-OEmodel_8 0.0408633515973255 0.0408633515973255
KOmodel_28-OEmodel_8 0.0307671763509816 0.0307671763509816
# Derive the group prefix (text before "model") for each module pair.
# Fix: vapply() instead of sapply() for a type-stable character result.
dat2$m1 = vapply(strsplit(dat2$module1, "model"), `[`, character(1), 1)
dat2$m2 = vapply(strsplit(dat2$module2, "model"), `[`, character(1), 1)
dat2$cross = paste(dat2$m1, dat2$m2, sep = "_Vs_")
# head(dat2)
dat2 = dat2 %>% filter(module1 != "none")
# Bar chart: number of similar modules per group-vs-group comparison
p2 = ggplot(dat2) + geom_bar(aes(x = cross, fill = cross)) +
  labs(x = "",
       y = "numbers.of.similar.modules"
  ) + theme_classic()
p2
# It is found that the networks of group 1 and group 3 are more similar
FileName <- paste(netpath,"module.compare.groups.pdf", sep = "")
ggsave(FileName, p1, width = 10, height = 10)
FileName <- paste(netpath,"numbers.of.similar.modules.pdf", sep = "")
ggsave(FileName, p2, width = 8, height = 8)
FileName <- paste(netpath,"module.otu.csv", sep = "")
write.csv(dat1,FileName, quote = F)
FileName <- paste(netpath,"module.compare.groups.csv", sep = "")
write.csv(dat2,FileName, quote = F)
## Network stability - robustness
# Robustness needs species abundances, so a ps object is required even though
# the correlation matrices are precomputed.
# Network stability - removing key nodes (targeted removal)
library(patchwork)
conflicts_prefer(dplyr::desc)
conflicts_prefer(ggplot2::theme_light)
res2= Robustness.Targeted.removal(ps = ps,
corg = cor,
degree = TRUE,
zipi = FALSE
)
p3 = res2[[1]]
p3
# Extracting data
dat4 = res2[[2]]
# Fix: these are the *targeted*-removal results, but they were written into a
# "Robustness_Random_removal" directory - the two sections had their output
# paths swapped.
path = paste(netpath,"/Robustness_Targeted_removal/",sep = "")
fs::dir_create(path)
write.csv(dat4,
paste(path,"targeted_removal_network.csv",sep = ""))
ggsave(paste(path,"targeted_removal_network.pdf",sep = ""), p3,width = 8,height = 4)
# Network stability - random removal of any proportion of nodes - network robustness
res3 = Robustness.Random.removal(ps = ps,
corg = cortab,
Top = 0
)
p4 = res3[[1]]
p4
# Extracting data
dat5 = res3[[2]]
# head(dat5)
# Fix: these are the *random*-removal results; the directory is corrected from
# the swapped "Robustness_Targeted_removal".
path = paste(netpath,"/Robustness_Random_removal/",sep = "")
fs::dir_create(path)
write.csv(dat5,
paste(path,"Robustness_Random_removal_network.csv",sep = ""))
ggsave(paste(path,"Robustness_Random_removal_network.pdf",sep = ""), p4,width = 8,height = 4)
## Network stability - negative correlation ratio
# Network stability - calculate the proportion of negative correlations
res4 = negative.correlation.ratio(ps = ps,
corg = cortab,
# Top = 500,
degree = TRUE,
zipi = FALSE)
p5 = res4[[1]]
p5
dat6 = res4[[2]]
# Negatively correlated ratio data
# head(dat6)
path = paste(netpath,"/Vulnerability/",sep = "")
fs::dir_create(path)
# Fix: dropped the stray "V" that had fused into the csv file name
# ("Vnegative..."), so csv and pdf names now match.
write.csv(dat6,
paste(path,"negative.correlation.ratio_network.csv",sep = ""))
ggsave(paste(path,"negative.correlation.ratio_network.pdf",sep = ""), p5,width = 4,height = 4)
## Network stability - community stability
# Community stability - only meaningful for paired samples.
treat = ps %>% sample_data()
# NOTE(review): assumes the 18 samples are ordered as 3 groups x 6 replicates,
# pairing replicate k across groups as "Ak" - confirm against the metadata.
treat$pair = paste( "A",c(rep(1:6,3)),sep = "")
# head(treat)
sample_data(ps) = treat
# No time gradient here: time = FALSE compares every pair of communities.
res5 = community.stability( ps = ps,
corg = cor,
time = FALSE)
p6 = res5[[1]]
p6
dat7 = res5[[2]]
path = paste(netpath,"/community.stability/",sep = "")
fs::dir_create(path)
write.csv(dat7,
paste(path,"community.stability.data.csv",sep = ""))
ggsave(paste(path,"community.stability..boxplot.pdf",sep = ""), p6,width = 4,height = 4)
# Network stability-network survivability
library("pulsar")
res6 = natural.con.microp (
ps = ps,
corg = cor,
norm = TRUE,
end = 150,# Smaller than the number of nodes in the network
start = 0
)
p7 = res6[[1]]
p7
dat8 = res6[[2]]
path = paste(netpath,"/Natural_connectivity/",sep = "")
fs::dir_create(path)
write.csv(dat8,
paste(path,"/Natural_connectivity.csv",sep = ""))
ggsave(paste(path,"/Natural_connectivity.pdf",sep = ""), p7,width = 5,height = 4)
## Network modularity analysis-module feature vector
id = sample_data(ps)$Group %>% unique()
id
[1] KO OE WT
Levels: KO OE WT
i = 1
netpath = paste(otupath,"/network3_MEs/",sep = "")
dir.create(netpath)
netpath;library(ggClusterNet)
[1] ".//network3_MEs/"
library(igraph)
for (i in seq_along(id)) {
  # One group at a time
  ps.1 = phyloseq::subset_samples(
    ps, Group %in% c(id[i])
  )
  result = network.2(ps = ps.1, N = 500,
                     big = TRUE,
                     select_layout = TRUE,
                     layout_net = "model_maptree",
                     r.threshold=0.8,
                     p.threshold=0.05,
                     label = FALSE,
                     path = netpath,
                     zipi = F,
                     ncol = 1,
                     nrow = 1,
                     # method = "sparcc",
                     fill = "Phylum"
  )
  # Module membership for each node, recomputed from the correlation matrix
  tem <- ggClusterNet::model_maptree(cor = result[[4]],
                                     method = "cluster_fast_greedy",
                                     seed = 12
  )
  node_model = tem[[2]]
  # Fix: include the group id in the file name; the original wrote the same
  # "node_model.csv" every iteration, so each group overwrote the previous one.
  tablename <- paste(netpath, "/", id[i], "_node_model", ".csv", sep = "")
  write.csv(node_model, tablename)
  otu = ps.1 %>%
    filter_OTU_ps(500) %>%
    vegan_otu() %>%
    as.data.frame()
  # Align module assignments with the OTU table columns
  node_model = node_model[match(colnames(otu), node_model$ID), ]
  # Module eigengenes (WGCNA): one characteristic variable per module
  MEList = WGCNA::moduleEigengenes(otu, colors = node_model$group)
  MEs = MEList$eigengenes %>% as.data.frame()
  tablename <- paste(netpath, "/", id[i], "node_characteristic_variables", ".csv", sep = "")
  write.csv(MEs, tablename)
}
[1] "KO"
[1] "cor matrix culculating over"
[1] "OE"
[1] "cor matrix culculating over"
[1] "WT"
[1] "cor matrix culculating over"
#detach("package:igraph")
# TOPOSCORE paper - Analysis script
# Reference: Derosa, L., et al. (2024). "Custom scoring based on ecological topology of gut microbiota associated with cancer immunotherapy outcome." Cell 187(13): 3373-3389 e3316.
# Load required libraries and helper functions (presumably defines load_clin,
# load_microbiome, log_msg, load_or_compute, screen_surv_met4, etc.)
source('data/data_TOPOSCORE/tp_helper.R')
## 1. Discovery analysis set ----
log_msg('####### Discovery analysis set #########')
clin_disc <- load_clin(cohort = 'Disc')
met4_disc <- load_microbiome(clin_disc)
conflicts_prefer(base::order)
### 1.1 CoxPH screen (except Akkermansia) ----
# Cached survival screen: recomputed only when the rds file is absent
res_surv <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/res_surv2.rds',
screen_surv_met4(clin_disc, met4_disc, type = 'OS'))
plot_surv_forest(res_surv, alpha = 0.05)
# select species based on average HR:
# keep species with a clear effect in either direction (HR outside [0.8, 1.25])
res_surv_filt <- res_surv %>% dplyr::filter(HR <= 0.8 | HR >= 1.25)
selected_species <- res_surv_filt$SPECIES
log_msg('%d/%d species selected', length(selected_species), nrow(res_surv))
plot_surv_forest(res_surv_filt %>% dplyr::arrange(HR))
met4_disc_filt <- met4_disc[, c('Sample_id', selected_species)]
# Tag each selected species as responder- (HR < 1) or non-responder-associated
hr_annots <- res_surv_filt %>% mutate(HRCAT = ifelse(HR < 1, 'R', 'NR')) %>%
dplyr::select(HRCAT, SPECIES)
### 1.2 Correlation screen ----
res <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/res_pairs2.rds', screen_pairs(met4_disc_filt))
# filter based on Fisher Bonferroni-corrected p-values <= 0.05
res_filt <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/res_pairs_filt2.rds', {
min_p <- bind_rows(list(
res %>% dplyr::select(VAR = VAR1, FISHER_P),
res %>% dplyr::select(VAR = VAR2, FISHER_P)
)) %>% group_by(VAR) %>% summarize(MIN_P = min(FISHER_P)) %>% arrange(-MIN_P)
sp2 <- min_p[min_p$MIN_P <= 0.05 / nrow(min_p), 'VAR', drop = TRUE]
res %>% filter(VAR1 %in% sp2 & VAR2 %in% sp2)
})
log_msg('Keeping %d pairs (%d species)', nrow(res_filt),
length(unique(c(res_filt$VAR1, res_filt$VAR2))))
### 1.3 Clustering ----
SCORE <- 'fisher_p'
METHOD <- 'ward.D2'
DISTANCE <- 'manhattan'
cc <- cluster_species(res_filt, score = SCORE, method = METHOD, distance = DISTANCE, k = 7) %>% renumber_clusters()
plt_fisher_disc <- plot_score_matrix(res_filt, score = SCORE, method = METHOD,
distance = DISTANCE, annots = list(cc, hr_annots), fontsize = 3)
ggsave(plt_fisher_disc, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_fisher_disc.pdf', width = 30, height = 20, units = "cm")
### 1.4 Definition of SIGB groups ----
cc_names <- unique(cc$CLUSTER)
cc_species <- setNames(lapply(cc_names, function(x) cc[cc$CLUSTER == x, 'SPECIES', drop = TRUE]), cc_names)
SIGB1 <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/sigb12.rds', c(cc_species$C5, cc_species$C6))
SIGB2 <- load_or_compute('results/Correlation_network_analysis/TOPOSCORE/sigb22.rds', c(cc_species$C1, cc_species$C2, cc_species$C3))
### 1.5 Toposcoring ----
scores_disc <- compute_toposcore(met4_disc, sigb1 = SIGB1, sigb2 = SIGB2)
pred_disc <- clin_disc %>% left_join(scores_disc, by = 'Sample_id') %>%
filter(OS12 != '') %>% mutate(OS12 = factor(OS12, levels = c('NR', 'R')))
roc <- calc_roc(pred_disc$OS12, pred_disc$TOPOB01, verbose = TRUE)
log_msg('ROC AUC = %.2f [%.2f - %.2f]', roc$AUC[1], roc$AUC[2], roc$AUC[3])
youden <- roc$ROC_DF %>% mutate(SENS = TPR, SPEC = 1 - FPR) %>% mutate(J = SENS + SPEC - 1)
ggplot(youden, aes(x = THRESHOLD, y = J)) + geom_point()
ycut_nr <- youden[which.max(youden$J), ] # 0.5351351
ycut_r <- youden[which(youden$THRESHOLD > 0.7 & youden$J > 0.23), ] # 0.7911411
log_msg('Cut-off thresholds = %.4f and %.4f', ycut_nr$THRESHOLD, ycut_r$THRESHOLD)
plt_roc <- plot_roc(roc$ROC_DF) +
geom_point(data = ycut_nr, color = 'red') +
geom_point(data = ycut_r, color = 'green')
ggsave(plt_roc, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_roc.pdf', width = 10, height = 10, units = "cm")
plt_kde_disc <- plot_toposcoreb01_density(scores_disc, clin_disc,
lims = c(ycut_r$THRESHOLD, ycut_nr$THRESHOLD))
ggsave(plt_kde_disc, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_kde_disc.pdf', width = 10, height = 10, units = "cm")
### 1.6 Prediction in discovery cohort (full signature) ----
pred_disc <- assign_prediction(pred_disc, ycut_r$THRESHOLD, ycut_nr$THRESHOLD)
hr_disc <- get_hr(pred_disc, type = 'OS', by = 'PRED')
log_msg('Prediction discovery: HR = %.2f [%.2f-%.2f], p = %.1e', hr_disc[1], hr_disc[2], hr_disc[3], hr_disc[4])
fig_km_disc <- print_plot(plot_mykm(pred_disc, type = 'OS', by = 'PRED'))
ggsave(fig_km_disc, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_km_disc.pdf', width = 10, height = 10, units = "cm")
library("survival")
library("survminer")
km_fit <- survfit(Surv(OS, Death) ~ PRED, data = pred_disc)
# Plotting the Kaplan-Meier curve
ggsurvplot(km_fit, data = pred_disc, pval = TRUE, conf.int = TRUE,
xlab = "Time", ylab = "Survival Probability",
palette = c("#E7B800", "#2E9FDF"))
pdf(file="results/Correlation_network_analysis/TOPOSCORE/fig_km_disc2.pdf",width = 10,height = 7)
km_fit <- survfit(Surv(OS, Death) ~ PRED, data = pred_disc)
ggsurvplot(km_fit, data = pred_disc,
pval = TRUE, conf.int = TRUE,
risk.table = TRUE, # Add risk table
risk.table.col = "strata", # Change risk table color by groups
linetype = "strata", # Change line type by groups
surv.median.line = "hv", # Specify median survival
ggtheme = theme_bw(), # Change ggplot2 theme
palette = c("#E7B800", "#2E9FDF"))
dev.off()
png
2
### 1.7 Prediction in discovery cohort (short signature) ----
# qPCR-amenable short signatures (subsets of the full SIGB1/SIGB2 species lists)
SIGB1_PCR <- c('Enterocloster_bolteae', 'Clostridium_symbiosum', 'Erysipelatoclostridium_ramosum',
'Hungatella_hathewayi', 'Veillonella_atypica')
SIGB2_PCR <- c('Anaerostipes_hadrus', 'Blautia_wexlerae', 'Coprococcus_comes',
'Dorea_formicigenerans', 'Dorea_longicatena', 'Eubacterium_rectale',
'Eubacterium_ventriosum', 'Faecalibacterium_prausnitzii',
'Gemmiger_formicilis', 'Phocaeicola_massiliensis', 'Roseburia_hominis',
'Roseburia_intestinalis', 'Roseburia_inulinivorans',
'Ruminococcus_bicirculans', 'Ruminococcus_lactaris')
# Score the discovery cohort with the short signature, assign predictions with
# the previously learned thresholds, and evaluate the hazard ratio on OS
scores_disc_short <- compute_toposcore(met4_disc, sigb1 = SIGB1_PCR, sigb2 = SIGB2_PCR)
pred_disc_short <- clin_disc %>% left_join(scores_disc_short, by = 'Sample_id') %>% assign_prediction(ycut_r$THRESHOLD, ycut_nr$THRESHOLD)
hr_disc_short <- get_hr(pred_disc_short, type = 'OS', by = 'PRED')
log_msg('Prediction discovery (short): HR = %.2f [%.2f-%.2f], p = %.1e', hr_disc_short[1], hr_disc_short[2], hr_disc_short[3], hr_disc_short[4])
fig_km_disc_short <- print_plot(plot_mykm(pred_disc_short, type = 'OS', by = 'PRED'))
#ggsave(fig_km_disc_short, filename = 'results/TOPOSCORE/fig_km_disc_short.pdf', width = 10, height = 10, units = "cm")
pdf(file="results/Correlation_network_analysis/TOPOSCORE/fig_km_disc_short2.pdf",width = 10,height = 10)
km_fit2 <- survfit(Surv(OS, Death) ~ PRED, data = pred_disc_short)
# NOTE(fix): print() is required for ggsurvplot objects inside a pdf device
# when the script is source()'d
print(ggsurvplot(km_fit2, data = pred_disc_short,
pval = TRUE, conf.int = TRUE,
risk.table = TRUE, # Add risk table
risk.table.col = "strata", # Change risk table color by groups
linetype = "strata", # Change line type by groups
surv.median.line = "hv", # Specify median survival
ggtheme = theme_bw(), # Change ggplot2 theme
palette = c("#E7B800", "#2E9FDF")))
dev.off()
#> png
#> 2
## 2. Validation analysis set ----
log_msg('####### Validation analysis set #########')
# Load clinical and microbiome data for the validation cohort
clin_valid <- load_clin(cohort = 'Valid')
met4_valid <- load_microbiome(clin_valid)
### 2.1 Toposcoring ----
scores_valid <- compute_toposcore(met4_valid, sigb1 = SIGB1, sigb2 = SIGB2)
# Density of toposcores, with the thresholds learned on the discovery cohort
plt_kde_valid <- plot_toposcoreb01_density(scores_valid, clin_valid, lims = c(ycut_r$THRESHOLD, ycut_nr$THRESHOLD))
ggsave(plt_kde_valid, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_kde_valid.pdf', width = 10, height = 10, units = "cm")
### 2.2 Prediction in validation cohort ----
pred_valid <- clin_valid %>% left_join(scores_valid, by = 'Sample_id') %>%
assign_prediction(ycut_r$THRESHOLD, ycut_nr$THRESHOLD)
hr_valid <- get_hr(pred_valid, type = 'OS', by = 'PRED')
log_msg('Prediction validation: HR = %.2f [%.2f-%.2f], p = %.1e', hr_valid[1], hr_valid[2], hr_valid[3], hr_valid[4])
fig_km_valid <- print_plot(plot_mykm(pred_valid, type = 'OS', by = 'PRED'))
ggsave(fig_km_valid, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_km_valid.pdf', width = 10, height = 10, units = "cm")
pdf(file="results/Correlation_network_analysis/TOPOSCORE/fig_km_valid2.pdf",width = 10,height = 10)
km_fit3 <- survfit(Surv(OS, Death) ~ PRED, data = pred_valid)
# NOTE(fix): print() is required for ggsurvplot objects inside a pdf device
# when the script is source()'d
print(ggsurvplot(km_fit3, data = pred_valid,
pval = TRUE, conf.int = TRUE,
risk.table = TRUE, # Add risk table
risk.table.col = "strata", # Change risk table color by groups
linetype = "strata", # Change line type by groups
surv.median.line = "hv", # Specify median survival
ggtheme = theme_bw(), # Change ggplot2 theme
palette = c("#E7B800", "#2E9FDF")))
dev.off()
#> png
#> 2
### 2.3 Prediction in validation cohort (short signature) ----
# Score the validation cohort with the short (qPCR) signature and evaluate
# the hazard ratio on OS, reusing the discovery-cohort thresholds
scores_valid_short <- compute_toposcore(met4_valid, sigb1 = SIGB1_PCR, sigb2 = SIGB2_PCR)
pred_valid_short <- clin_valid %>% left_join(scores_valid_short, by = 'Sample_id') %>% assign_prediction(ycut_r$THRESHOLD, ycut_nr$THRESHOLD)
hr_valid_short <- get_hr(pred_valid_short, type = 'OS', by = 'PRED')
log_msg('Prediction validation (short): HR = %.2f [%.2f-%.2f], p = %.1e', hr_valid_short[1], hr_valid_short[2], hr_valid_short[3], hr_valid_short[4])
fig_km_valid_short <- print_plot(plot_mykm(pred_valid_short, type = 'OS', by = 'PRED'))
ggsave(fig_km_valid_short, filename = 'results/Correlation_network_analysis/TOPOSCORE/fig_km_valid_short.pdf', width = 10, height = 10, units = "cm")
pdf(file="results/Correlation_network_analysis/TOPOSCORE/fig_km_valid_short2.pdf",width = 10,height = 10)
km_fit4 <- survfit(Surv(OS, Death) ~ PRED, data = pred_valid_short)
# NOTE(fix): print() is required for ggsurvplot objects inside a pdf device
# when the script is source()'d
print(ggsurvplot(km_fit4, data = pred_valid_short,
pval = TRUE, conf.int = TRUE,
risk.table = TRUE, # Add risk table
risk.table.col = "strata", # Change risk table color by groups
linetype = "strata", # Change line type by groups
surv.median.line = "hv", # Specify median survival
ggtheme = theme_bw(), # Change ggplot2 theme
palette = c("#E7B800", "#2E9FDF")))
dev.off()
#> png
#> 2
# ---- Tax4Fun2 functional prediction (16S) ----
source("./function/runRefBlast2.R")
otupath = "./"
funcpath = paste(otupath,"results/Function_Prediction/Tax4Fun2/",sep = "")
dir.create(funcpath)
library(Tax4Fun2)
## First configure the database - it may fail frequently, it is recommended to use the backup library here
# buildReferenceData(path_to_working_directory = '.', use_force = FALSE, install_suggested_packages = TRUE)
# Database download link is https://zenodo.org/records/10035668
path_to_reference_data = "./results/Function_Prediction/Tax4Fun2/Tax4Fun2_ReferenceData_v2"
# Optional sanity check that the bundled blastn binary is callable:
# blast_bin = file.path(path_to_reference_data, "blast_bin/bin/blastn.exe")
# res = system(command = paste(blast_bin, "-help"), intern = T)
otudir = funcpath
# Load package
library(Tax4Fun2)
# Species annotation
# Specify the OTU representative sequences, Tax4Fun2 library location, reference
# database version, number of blastn alignment threads, etc.
# runRefBlast2(path_to_otus = './data/data.ps/otu.fa',
# path_to_reference_data = path_to_reference_data,
# path_to_temp_folder = otudir, database_mode = 'Ref100NR',
# use_force = TRUE, num_threads = 4)
# runRefBlast2(path_to_otus = './data/data.ps/otu.fa',
# path_to_reference_data = path_to_reference_data,
# path_to_temp_folder = otudir, database_mode = 'Ref100NR',
# use_force = TRUE, num_threads = 4)
# Predicting community function
# Specify the OTU abundance table, the location of the Tax4Fun2 library, the
# reference database version, the species annotation result path of the
# previous step, etc.
# makeFunctionalPrediction(path_to_otu_table = './data/data.ps/otutab.txt',
# path_to_reference_data = path_to_reference_data,
# path_to_temp_folder = otudir,
# database_mode = 'Ref100NR',
# normalize_by_copy_number = TRUE,
# min_identity_to_reference = 0.97,
# normalize_pathways = FALSE)
# ---- FUNGuild functional prediction for the ITS (fungal) community ----
# Load packages
# BiocManager::install("microeco")
library(EasyStat)
library(microeco)
library(ggplot2)
library("WGCNA")
library(tidyverse)
library(ggtree)
library(SpiecEasi)
library(ggClusterNet)
library(phyloseq)
library(magrittr)
# Install (if missing) and attach additional helper packages
p_list = c("ggplot2", "BiocManager", "devtools","picante", "GUniFrac", "ggalluvial", "rgexf")
for(p in p_list){if (!requireNamespace(p)){install.packages(p)}
library(p, character.only = TRUE, quietly = TRUE, warn.conflicts = F)}
# ITS phyloseq object
ps = readRDS("./data/dataNEW/ps_ITS.rds")
# ps = ps
# Import built-in fungal example data (alternative input)
# data(sample_info_ITS)
# data(otu_table_ITS)
# data(taxonomy_table_ITS)
# OTU table transposed so taxa are rows and samples are columns
otu = ps %>% vegan_otu() %>%
t() %>%
as.data.frame()
tax = ps %>% vegan_tax() %>%
as.data.frame()
# Constructing the microeco analysis object
dataset = microtable$new(sample_table = sample_data(ps), otu_table = otu, tax_table = tax)
# Screening fungi: keep only taxa with Kingdom == "Fungi"
dataset$tax_table %<>% subset(Kingdom == "Fungi")
# Function prediction (FUNGuild)
t2 = trans_func$new(dataset)
# Assign functional guilds to each taxon
t2$cal_spe_func()
data = t2$res_spe_func_raw_funguild
fugipath = paste(otupath,"results/Function_Prediction/funguild",sep = "")
dir.create(fugipath)
# dir.create("./result_and_plot/Base_diversity_ITS/OTU_220921//funguild")
write.csv(data,paste(fugipath ,"/funguild.csv",sep = ""))
## FAPROTAX functional prediction for the 16S (prokaryote) community
# install.packages("VGAM")
ps = readRDS("./data/dataNEW/ps_16s.rds")
# Load packages
library(microeco)
library(ggplot2)
library("WGCNA")
library(tidyverse)
library(ggtree)
library("SpiecEasi")
library(ggClusterNet)
library(phyloseq)
library(magrittr)
# ps = readRDS("./data/dataNEW/ps_16s.rds")
# Keep the 500 most abundant OTUs
ps = ps %>%
filter_OTU_ps(500)
# OTU table transposed so taxa are rows and samples are columns
otu = ps %>% vegan_otu() %>%
t() %>%
as.data.frame()
tax = ps %>% vegan_tax() %>%
as.data.frame()
# Constructing the microeco analysis object
dataset = microtable$new(sample_table = sample_data(ps), otu_table = otu, tax_table = tax)
t2 = trans_func$new(dataset)
# Assign FAPROTAX functional traits to each taxon
t2$cal_spe_func()
t2$res_spe_func[1:5, 1:6]
# NOTE(fix): the pasted console output below was raw (unparseable) text in the
# original transcript and would break source(); kept here as comments.
#> methanotrophy acetoclastic_methanogenesis
#> ASV_662 0 0
#> ASV_782 0 0
#> ASV_874 0 0
#> ASV_223 0 0
#> ASV_508 0 0
#> methanogenesis_by_disproportionation_of_methyl_groups
#> ASV_662 0
#> ASV_782 0
#> ASV_874 0
#> ASV_223 0
#> ASV_508 0
#> methanogenesis_using_formate methanogenesis_by_CO2_reduction_with_H2
#> ASV_662 0 0
#> ASV_782 0 0
#> ASV_874 0 0
#> ASV_223 0 0
#> ASV_508 0 0
#> methanogenesis_by_reduction_of_methyl_compounds_with_H2
#> ASV_662 0
#> ASV_782 0
#> ASV_874 0
#> ASV_223 0
#> ASV_508 0
data = t2$res_spe_func
# Drop taxa that were not assigned to any function
data = data[rowSums(data)> 0,]
betapath = paste(otupath,"results/Function_Prediction/FAPROTAX/",sep = "")
dir.create(betapath)
# dir.create("./result_and_plot/Base_diversity_16s//OTU_220921//FAPROTAX")
write.csv(data,paste(betapath,"/FAPROTAX.csv",sep = ""))
# View Function Group List
t2$func_group_list
#> $FAPROTAX
#> $FAPROTAX$`Energy source`
#> [1] "aerobic_chemoheterotrophy" "anaerobic_chemoheterotrophy"
#> [3] "photoautotrophy" "photoheterotrophy"
#> $FAPROTAX$`C-cycle`
#> [1] "cellulolysis" "xylanolysis"
#> [3] "chitinolysis" "ligninolysis"
#> [5] "fermentation" "methanogenesis"
#> [7] "methanotrophy" "methylotrophy"
#> [9] "hydrocarbon_degradation" "oil_bioremediation"
#> $FAPROTAX$`N-cycle`
#> [1] "nitrogen_fixation" "nitrification"
#> [3] "aerobic_ammonia_oxidation" "aerobic_nitrite_oxidation"
#> [5] "nitrate_reduction" "nitrate_respiration"
#> [7] "nitrite_respiration" "nitrogen_respiration"
#> [9] "denitrification" "nitrite_denitrification"
#> [11] "nitrate_denitrification" "nitrous_oxide_denitrification"
#> [13] "ureolysis"
#> $FAPROTAX$`S-cycle`
#> [1] "sulfate_respiration" "sulfur_respiration"
#> [3] "sulfite_respiration" "dark_sulfide_oxidation"
#> [5] "respiration_of_sulfur_compounds" "thiosulfate_respiration"
#> [7] "dark_oxidation_of_sulfur_compounds"
# View a category
t2$show_prok_func("methanotrophy")
#> [1] "elements:C,H; main_element:C; electron_donor:C; electron_acceptor:variable; aerobic:variable; exclusively_prokaryotic:yes; light_dependent:no"
# ---- db-RDA of species composition constrained by host covariates ----
# load packages
# rm(list = ls())
library(vegan)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(rdacca.hp)
# load data
# load species data (features in rows in the csv; transpose to samples-in-rows)
otu = read.csv('data/species_data.csv', head = T, row.names=1)
otu <- data.frame(t(otu))
# group data
matadata <- read.table(paste("data/group_data.txt",sep=""), header=T, row.names=1, sep="\t", comment.char="")
otu = otu[rownames(otu) %in% rownames(matadata), ]
# confounding factors (status / sex / age)
env = read.csv('data/c_index_species_new4.csv', header = T, row.names=1)
env = na.omit(env)
env = env[rownames(env) %in% rownames(matadata), ]
# NOTE(fix): the original stored the sample ids in a variable literally named
# `rownames`, masking base::rownames(); build the one-column helper frame
# directly instead. The merge below is unchanged (it still joins on a column
# called "rownames") and restricts otu to samples that also have confounders.
env_ids = data.frame(rownames = rownames(env))
otu$rownames = rownames(otu)
otu = merge(otu, env_ids, by = "rownames")
rownames(otu) = otu$rownames
otu = otu[, -1]
## Calculate db-RDA step by step according to the principle
# Calculate the sample distance, taking Bray-curtis distance as an example, details ?vegdist
# dis_bray <- vegdist(otu, method = 'bray')
distance_mat = read.table(paste("data/Species_beta_diversity2.txt",sep=""), header=T, row.names=1, sep="\t", comment.char="")
# Restrict the precomputed distance matrix to the grouped samples
metadata2 = t(matadata)
distance_mat2 = distance_mat[rownames(distance_mat) %in% rownames(matadata), ]
distance_mat3 = distance_mat2[, colnames(distance_mat2) %in% colnames(metadata2)]
distance_mat = distance_mat3
# Or directly use the existing distance matrix, which is also the Bray-curtis distance
dis_bray <- as.dist(distance_mat)
# PCoA sorting, here add = TRUE is used to correct negative eigenvalues, details ?cmdscale
pcoa <- cmdscale(dis_bray, k = nrow(otu) - 1, eig = TRUE, add = TRUE)
# Extract PCoA sample scores (coordinates)
pcoa_site <- pcoa$point
# db-RDA, multiple regression of environmental variables and PCoA axes
# Execute via the RDA function rda() of the vegan package, details ?rda
db_rda <- rda(pcoa_site, env, scale = FALSE)
# summary(db_rda)
# Passively fit species scores onto the constrained (CCA) and unconstrained (CA) axes
v.eig <- t(otu) %*% db_rda$CCA$u/sqrt(nrow(otu) - 1)
db_rda$CCA$v <- decostand(v.eig, 'normalize', MARGIN = 2)
v.eig <- t(otu) %*% db_rda$CA$u/sqrt(nrow(otu) - 1)
db_rda$CA$v <- decostand(v.eig, 'normalize', MARGIN = 2)
# Extract the data needed for drawing
score = scores(db_rda)
# score$sites
db_rda$CCA$biplot
# NOTE(fix): pasted console output commented out (was unparseable raw text):
#> RDA1 RDA2 RDA3
#> status 0.76440409 -0.48475784 0.4250838
#> sex 0.98352818 0.17911294 -0.0243079
#> age -0.01213581 -0.06874258 -0.9975606
# score$species
CAP1 = score$sites[,1]
CAP2 = score$sites[,2]
seg = as.data.frame(db_rda$CCA$biplot)
CPA_data = as.data.frame(score$sites)
CPA_data$group = rownames(CPA_data)
CPA_data$group = as.character(CPA_data$group)
# Strip trailing digits from sample ids to recover the group label
CPA_data$group = gsub("[0-9]","", CPA_data$group)
# Integrate the above extracted data into the table required for drawing
plotdata = data.frame(rownames(score$sites), CAP1, CAP2, CPA_data$group)
colnames(plotdata) = c('sample','CAP1','CAP2','Group') # add column names
# write.csv(plotdata,'data/data5/dbRDA_npc_KO9.csv')
# Calculate the explanation of the first and second principal axes
CAP1_exp = round(db_rda$CCA$eig[1]/sum(db_rda$CCA$eig)*100,2)
CAP2_exp = round(db_rda$CCA$eig[2]/sum(db_rda$CCA$eig)*100,2)
p1 = ggplot(plotdata, aes(CAP1, CAP2)) +
geom_point(aes(fill = Group, color = Group),size = 1.3) +
scale_fill_manual(values = c("#74add1","#a60026"))+
scale_color_manual(values = c("#74add1","#a60026"))+
#add confidence intervals
#stat_ellipse(linetype = 1,level = 0.95,aes(group = Treatment, color = Treatment)) +
#stat_chull(geom = 'polygon', aes(group = Treatment, color = Treatment, fill = Treatment), alpha = 0.1) +
xlab(paste('dbRDA1 ( ',CAP1_exp,'%',' )', sep = '')) +
ylab(paste('dbRDA2 ( ',CAP2_exp,'%',' )', sep = '')) +
geom_segment(data = seg, aes(x = 0, y = 0, xend = seg[,1], yend = seg[,2]),
colour = "red", size = 0.3,
arrow = arrow(angle = 30, length = unit(0.4, 'cm'))) +
geom_text_repel(data = seg, segment.colour = 'black',
aes(x = seg[,1], y = seg[,2],
label = rownames(seg)),size = 3) +
geom_vline(aes(xintercept = 0), linetype = 'dotted') +
geom_hline(aes(yintercept = 0), linetype = 'dotted') +
theme_bw()+
theme(text = element_text(family = 'sans', size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
,legend.position = 'none'
)+
theme(axis.text = element_text(colour = 'black',size = 12))+
stat_ellipse(#data=plotdata,
#geom = "polygon",
level=0.95,
linetype = 2,size=0.7,
aes(color=Group),
alpha=0.8)+
coord_equal(ratio=0.7)
# NOTE(fix): the original paste() appended ".pdf" twice, writing the file as
# "dbRDA_Species01.pdf.pdf"; use the intended single-extension filename.
ggsave("results/Other_microbiome_analysis/dbRDA_Species01.pdf", p1, width=149 * 1.5, height=80 * 1.5, unit='mm')
p1
# Permutation test
env$status = as.numeric(env$status)
env$sex = as.vector(env$sex)
# env$region = as.vector(env$region)
env$age = as.vector(env$age)
envfit <- envfit(db_rda, env, permutations = 999)
r <- as.matrix(envfit$vectors$r)
p <- as.matrix(envfit$vectors$pvals)
env.p <- cbind(r,p)
colnames(env.p) <- c("r2","p-value")
KK <- as.data.frame(env.p)
# Benjamini-Hochberg correction of the envfit p-values
KK$p.adj = p.adjust(KK$`p-value`, method = 'BH')
# KK
# write.csv(as.data.frame(env.p),file="data/rdaenvfit_HP_new_species_01.csv")
# Use rdacca.hp to calculate the effect of each environmental factor
dis_bray <- as.dist(distance_mat)
bray = dis_bray
cap.hp = rdacca.hp(bray, env, method = 'dbRDA', type = 'R2', scale = FALSE)
cap.hp$Total_explained_variation
#> [1] 0.204
cap.hp$Hier.part
#> Unique Average.share Individual I.perc(%)
#> status 0.0760 -0.0009 0.0751 36.81
#> sex 0.0788 0.0026 0.0814 39.90
#> age 0.0539 -0.0063 0.0476 23.33
# Export the explanation of each environmental factor
# write.csv(cap.hp$Hier.part, 'data/env_effect_HP_new_Species_02.csv')
# Unique: the proportion of the total variation explained by each explanatory
# variable alone, i.e. the part not shared with other explanatory variables
# Average.share: the split of the variation each explanatory variable explains
# jointly with the other explanatory variables
# Individual: each explanatory variable's share of the total variation;
# Individual equals the sum of Unique and Average.share
# I.perc(%): each variable's Individual as a percentage of the Individual sum;
# the Individual sum equals the total explained variance of the db-RDA
library(MMUPHin) # A software package for meta-analysis of microbiome data
library(magrittr) # Pipeline operators in R language
library(dplyr) # Data processing software package
library(ggplot2) # Drawing software package
library(vegan) # Data analysis software package
# Remove batch effects
# Load metadata
meta.all <- read.csv(file = 'data/group_batch.csv', stringsAsFactors = FALSE, header = TRUE, row.names = 1, check.name = FALSE)
# rownames(meta.all) <- meta.all$Run
meta.all$StudyID <- factor(meta.all$StudyID)
# Import relative abundance data of bacterial species
feat.abu <- read.table(file = "data/species_batch.txt", sep = "\t", header = TRUE, check.names = FALSE)
# Sum of Species
feat.abu <- aggregate(. ~ Species, data = feat.abu, sum)
rownames(feat.abu) <- feat.abu$Species
feat.abu <- feat.abu[, -1]
# Ensure feat.abu columns match meta.all rownames
feat.abu <- feat.abu[, rownames(meta.all)]
# Replace NA values with 0 and scale the data
feat.abu[is.na(feat.abu)] <- 0
feat.abu <- feat.abu / 100
# Zero-inflated empirical Bayes adjustment of batch effect in compositional feature abundance data
fit_adjust_batch <- adjust_batch(
feature_abd = feat.abu,
batch = "StudyID",
covariates = c("Group", "Gender", "Age"),
data = meta.all,
control = list(verbose = FALSE)
)
# Adjusted feature abundance data
npc_abd_adj <- fit_adjust_batch$feature_abd_adj
npc_abd_adj <- npc_abd_adj * 100
# Save adjusted data to CSV
write.csv(npc_abd_adj, 'results/Other_microbiome_analysis/Species_ra_adj_age_new100.csv')
# Batch Effect Diagram: PCoA of community distances before vs after
# MMUPHin batch correction
# Load packages
library(vegan, quietly = TRUE)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(ggthemes)
library(ggpubr)
library(vegan)
library(UpSetR)
library(ggsci)
library(doBy)
# Dissimilarity Indices for Community Ecologists
D_before <- vegdist(t(feat.abu)) # Bray-Curtis distance as default
D_after <- vegdist(t(npc_abd_adj))
# Permutational Multivariate Analysis of Variance Using Distance Matrices
set.seed(1)
fit_adonis_before <- adonis2(D_before ~ StudyID, data = meta.all, permutations = 999, method="bray", binary=F)
# PCoA before batch correction
feat.abu2 <- as.data.frame(t(feat.abu))
dune_dist_before <- vegdist(feat.abu2, method="bray", binary=F)
dune_pcoa_before <- cmdscale(D_before, k= (nrow(feat.abu2) - 1), eig = TRUE, add = TRUE)
dune_pcoa_points_before <- as.data.frame(dune_pcoa_before$points)
sum_eig_before <- sum(dune_pcoa_before$eig)
eig_percent_before <- round(dune_pcoa_before$eig / sum_eig_before * 100, 1)
# NOTE(fix): cmdscale() returns k = n-1 axes, but the original named only the
# first three columns ("PCoA1".."PCoA3"), corrupting the frame's names; name
# every returned axis instead.
colnames(dune_pcoa_points_before) <- paste0("PCoA", seq_len(ncol(dune_pcoa_points_before)))
dune_pcoa_result_before <- cbind(dune_pcoa_points_before, meta.all)
# Plot PCoA before batch correction
conflicts_prefer(ggplot2::theme_minimal)
p1 <- ggplot(dune_pcoa_result_before, aes(x=PCoA1, y=PCoA2, color = Group, shape = StudyID)) +
geom_point(size=2) +
scale_shape_manual(values = c(15, 10, 17, 18, 19, 8, 25, 3)) +
labs(x=paste("PCoA 1 (", eig_percent_before[1], "%)", sep=""),
y=paste("PCoA 2 (", eig_percent_before[2], "%)", sep="")) +
theme_classic()
plot1 <- ggscatter(dune_pcoa_result_before, x= "PCoA1", y = "PCoA2", color="StudyID",
mean.point = TRUE, star.plot = TRUE, ggtheme = theme_minimal()) +
labs(x=paste("PCo 1 (", eig_percent_before[1], "%)", sep=""),
y=paste("PCo 2 (", eig_percent_before[2], "%)", sep="")) +
theme_classic() +
geom_vline(xintercept = 0, color = 'gray', size = 0.4) +
geom_hline(yintercept = 0, color = 'gray', size = 0.4) +
theme(panel.grid = element_line(color = 'black', linetype = 2, size = 0.1),
panel.background = element_rect(color = 'black', fill = 'transparent'),
legend.title = element_blank()) +
theme(axis.title = element_text(size = 18, colour="black"),
axis.text = element_text(size = 16, colour = "black"),
legend.text = element_text(size = 16))
# Print the results of permutation multiple regression analysis of variance after batch correction
set.seed(1)
fit_adonis_after <- adonis2(D_after ~ StudyID, data = meta.all, permutations = 999, method="bray")
# PCoA after batch correction
npc_abd_adj2 <- as.data.frame(t(npc_abd_adj))
dune_dist_after <- vegdist(npc_abd_adj2, method="bray", binary=F)
dune_pcoa_after <- cmdscale(D_after, k= (nrow(npc_abd_adj2) - 1), eig = TRUE, add = TRUE)
dune_pcoa_points_after <- as.data.frame(dune_pcoa_after$points)
sum_eig_after <- sum(dune_pcoa_after$eig)
eig_percent_after <- round(dune_pcoa_after$eig / sum_eig_after * 100, 1)
# NOTE(fix): same axis-naming fix as for the "before" PCoA above.
colnames(dune_pcoa_points_after) <- paste0("PCoA", seq_len(ncol(dune_pcoa_points_after)))
dune_pcoa_result_after <- cbind(dune_pcoa_points_after, meta.all)
# Plot PCoA after batch correction
p2 <- ggplot(dune_pcoa_result_after, aes(x=PCoA1, y=PCoA2, color = Group, shape = StudyID)) +
geom_point(size=2) +
scale_shape_manual(values = c(15, 10, 17, 18, 19, 8, 25, 3)) +
labs(x=paste("PCo 1 (", eig_percent_after[1], "%)", sep=""),
y=paste("PCo 2 (", eig_percent_after[2], "%)", sep="")) +
theme_classic() +
stat_ellipse(level=0.95, linetype = 2, size=0.7, aes(color=Group), alpha=0.8)
plot2 <- ggscatter(dune_pcoa_result_after, x= "PCoA1", y = "PCoA2", color="StudyID",
mean.point = TRUE, star.plot = TRUE, ggtheme = theme_minimal()) +
labs(x=paste("PCoA 1 (", eig_percent_after[1], "%)", sep=""),
y=paste("PCoA 2 (", eig_percent_after[2], "%)", sep="")) +
theme_classic() +
geom_vline(xintercept = 0, color = 'gray', size = 0.4) +
geom_hline(yintercept = 0, color = 'gray', size = 0.4) +
theme(panel.grid = element_line(color = 'black', linetype = 2, size = 0.1),
panel.background = element_rect(color = 'black', fill = 'transparent'),
legend.title = element_blank()) +
theme(axis.title = element_text(size = 18, colour="black"),
axis.text = element_text(size = 16, colour = "black"),
legend.text = element_text(size = 16))
# Combine plots (before | after)
library(patchwork)
p_all <- p1 + p2
ggsave(p_all, file = "results/Other_microbiome_analysis/Compare01.pdf", width = 289, height = 150, unit = 'mm')
p_all2 <- plot1 + plot2
ggsave(p_all2, file = "results/Other_microbiome_analysis/Compare02.pdf", width = 289, height = 100, unit = 'mm')
# ---- HAllA association heatmap ----
#Halla Analysis Implementation
#Implementation of HALLA module through online websites http://galaxy.biobakery.org/
# Inputs
# HAllA requires two tab-delimited text files representing two paired datasets (sets of features) describing the same set of samples.
# The example data inputs can be found in the "example_data" folder or http://galaxy.biobakery.org/
# Outputs
# A pdf - the "halagram.pdf"
# A zip file containing all the files in the generated output directory
# Load packages
library(pheatmap)
library(tidyr)
library(vegan)
library(ComplexHeatmap)
# Graphic beautification and drawing of heat maps
data <- read.table("data/all_associations.txt", header=TRUE, sep="\t")
# Convert long (X_features, Y_features, association) records to a wide matrix
data2 <- pivot_wider(data, names_from = "Y_features", values_from = "association")
data2 <- as.data.frame(data2)
rownames(data2) <- data2$X_features
data2 <- data2[, -1]
# Draw a clustered heat map of the association values
p <- Heatmap(data2,
name = "Association",
row_names_gp = gpar(fontsize = 8),
cluster_rows = TRUE, cluster_columns = TRUE,
col = colorRampPalette(colors = c("#87CEFF", "white", "#FF7256"))(50)
)
# Save plot
pdf("results/Other_microbiome_analysis/halla_association01.pdf", width = 10, height = 8)
draw(p)
dev.off()
#> png
#> 2
# Draw a heat map annotated with significant q-values
data_km_q <- read.table("data/all_q_value.txt", header=TRUE, sep="\t")
# Convert the long q-value table to a wide matrix
data_km_q2 <- pivot_wider(data_km_q, names_from = "Y_features", values_from = "q.values")
data_km_q2 <- as.data.frame(data_km_q2)
rownames(data_km_q2) <- data_km_q2$X_features
data_km_q2 <- data_km_q2[, -1]
# NOTE(review): cell_fun indexes data_km_q2 by matrix position, so this
# assumes data_km_q2 has the same row/column order as data2 — verify the two
# input files list the features in the same order.
p2 <- Heatmap(data2,
name = "Association",
cell_fun = function(j, i, x, y, width, height, fill) {
# NOTE(fix): guard against NA q-values (X/Y pairs missing from the long
# table become NA after pivot_wider), which previously aborted drawing
# with "missing value where TRUE/FALSE needed".
if (!is.na(data_km_q2[i, j]) && data_km_q2[i, j] < 0.05) {
grid.text(sprintf("%.3f", data_km_q2[i, j]), x, y, gp = gpar(fontsize = 6))
} else {
grid.text("", x, y)
}
}
)
# Save plot
pdf("results/Other_microbiome_analysis/halla_association2.pdf", width = 10, height = 8)
draw(p2)
dev.off()
#> png
#> 2
# ---- Procrustes analysis: community composition vs environment ----
# Load packages and data (vegan's built-in varespec/varechem example data)
library(vegan)
library(ggplot2)
data(varespec)
data(varechem)
# Calculate the distance matrix between samples
species_dist <- vegdist(varespec, method = "bray")
env_dist <- vegdist(scale(varechem), method = "euclidean")
# Dimensionality reduction using NMDS
nmds_species <- monoMDS(species_dist)
nmds_env <- monoMDS(env_dist)
# Procrustes Analysis: rotate the environment ordination onto the species ordination
proc_result <- procrustes(nmds_species, nmds_env, symmetric = TRUE)
summary(proc_result)
#> Call:
#> procrustes(X = nmds_species, Y = nmds_env, symmetric = TRUE)
#> Number of objects: 24 Number of dimensions: 2
#> Procrustes sum of squares:
#> 0.6515295
#> Procrustes root mean squared error:
#> 0.1647637
#> Quantiles of Procrustes errors:
#> Min 1Q Median 3Q Max
#> 0.02987198 0.10213559 0.14600278 0.17560233 0.34398810
#> Rotation matrix:
#> [,1] [,2]
#> [1,] 0.7730178 0.6343843
#> [2,] -0.6343843 0.7730178
#> Translation of averages:
#> [,1] [,2]
#> [1,] -1.681189e-18 2.334995e-18
#> Scaling of target:
#> [1] 0.5903139
# Significance test (PROTEST permutation test of the Procrustes statistic)
set.seed(1)
proc_test <- protest(nmds_species, nmds_env, permutations = 999)
proc_test
#> Call:
#> protest(X = nmds_species, Y = nmds_env, permutations = 999)
#> Procrustes Sum of Squares (m12 squared): 0.6515
#> Correlation in a symmetric Procrustes rotation: 0.5903
#> Significance: 0.001
#> Permutation: free
#> Number of permutations: 999
# Extract coordinate data: rotated environment scores (Yrot) paired with the
# target community scores (X), one row per sample
procrustes_data <- as.data.frame(cbind(proc_result$Yrot, proc_result$X))
rotation_matrix <- as.data.frame(proc_result$rotation)
# Plot: each arrow joins a sample's environment position (V1/V2) to its
# community position (MDS1/MDS2); shorter arrows indicate better agreement
ggplot(procrustes_data) +
geom_segment(aes(x = V1, y = V2, xend = MDS1, yend = MDS2),
color = "#66C2A5", size = 1, arrow = arrow(length = unit(0.25, 'cm'))) +
geom_point(aes(V1, V2), color = "#FC8D62", size = 4, shape = 16) +
geom_point(aes(MDS1, MDS2), color = "#8DA0CB", size = 4, shape = 16) +
theme_classic(base_size = 16) +
theme(
panel.border = element_rect(color = "black", fill = NA, size = 1),
panel.grid = element_blank(),
panel.background = element_rect(fill = "transparent", color = NA),
axis.ticks.length = unit(0.4, "lines"),
axis.line = element_blank(),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 18, face = "bold")
) +
labs(x = "Dimension 1", y = "Dimension 2",
title = "Procrustes Analysis: Community vs Environment") +
geom_vline(xintercept = 0, color = "grey60", linetype = "dashed", size = 0.5) +
geom_hline(yintercept = 0, color = "grey60", linetype = "dashed", size = 0.5) +
geom_abline(slope = 1, intercept = 0, color = "black", size = 0.8) + # first diagonal, slope 1
geom_abline(slope = -1, intercept = 0, color = "black", size = 0.8) + # second diagonal, slope -1
annotate("text", label = paste("M2 =", round(proc_test$ss, 4),
"\nP-value =", format.pval(proc_test$signif)),
x = 0.2, y = -0.15, size = 4, hjust = 0)
# Save the last displayed plot as PPT (graph2ppt exports the current device)
library(export)
graph2ppt(file = "results/Other_microbiome_analysis/Procrustes_Analysis_with_Crosslines.ppt", width = 5, height = 5)
# ---- Mediation analysis demo on simulated data (based on iris) ----
# Load packages
#install.packages("mediation")
library(ggplot2)
library(dplyr)
library(mediation)
# Simulate a mediation effect, set the random seed and generate simulated data
set.seed(2344)
df <- iris %>%
mutate(
random_factor1 = runif(n(), min = min(Sepal.Length), max = max(Sepal.Length)),
mediator_var = 0.4 * Sepal.Length + 0.6 * random_factor1, # Simulating mediating variables
random_factor2 = runif(n(), min = min(mediator_var), max = max(mediator_var)),
dependent_var = 0.4 * mediator_var + 0.6 * random_factor2 # Simulating dependent variables
)
# Statistical analysis
# 1. Calculate the total effect (the total impact of the independent variable on the dependent variable)
total_effect_model <- lm(dependent_var ~ Sepal.Length, data = df)
summary(total_effect_model)
#> Call:
#> lm(formula = dependent_var ~ Sepal.Length, data = df)
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.28365 -0.39582 0.03309 0.41165 1.20451
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 5.19594 0.32851 15.817 <2e-16 ***
#> Sepal.Length 0.14481 0.05567 2.601 0.0102 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> Residual standard error: 0.5627 on 148 degrees of freedom
#> Multiple R-squared: 0.04372, Adjusted R-squared: 0.03726
#> F-statistic: 6.767 on 1 and 148 DF, p-value: 0.01023
# 2. Calculate the impact of the independent variable on the mediating variable
# (path "a" of the mediation model)
mediator_model <- lm(mediator_var ~ Sepal.Length, data = df)
summary(mediator_model)
#> Call:
#> lm(formula = mediator_var ~ Sepal.Length, data = df)
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.1320 -0.4904 0.0604 0.4698 0.9850
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 3.6625 0.3417 10.719 < 2e-16 ***
#> Sepal.Length 0.4118 0.0579 7.112 4.53e-11 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> Residual standard error: 0.5852 on 148 degrees of freedom
#> Multiple R-squared: 0.2547, Adjusted R-squared: 0.2497
#> F-statistic: 50.58 on 1 and 148 DF, p-value: 4.534e-11
# 3. Calculate the impact of the mediating variable on the dependent variable while controlling the independent variable
# (paths "b" and "c'" of the mediation model)
mediated_effect_model <- lm(dependent_var ~ Sepal.Length + mediator_var, data = df)
summary(mediated_effect_model)
#> Call:
#> lm(formula = dependent_var ~ Sepal.Length + mediator_var, data = df)
#> Residuals:
#> Min 1Q Median 3Q Max
#> -0.84899 -0.37869 -0.01874 0.34800 0.98188
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 3.53127 0.38714 9.122 5.20e-16 ***
#> Sepal.Length -0.04235 0.05701 -0.743 0.459
#> mediator_var 0.45452 0.06988 6.504 1.15e-09 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> Residual standard error: 0.4975 on 147 degrees of freedom
#> Multiple R-squared: 0.2574, Adjusted R-squared: 0.2473
#> F-statistic: 25.48 on 2 and 147 DF, p-value: 3.158e-10
# 4. Causal Mediation Analysis: decompose the total effect into ACME
# (mediated) and ADE (direct) with bootstrap confidence intervals
conflicts_prefer(mediation::mediate)
causal_mediation_results <- mediate(
model.m = mediator_model,
model.y = mediated_effect_model,
treat = "Sepal.Length",
mediator = "mediator_var",
boot = TRUE
)
summary(causal_mediation_results)
#> Causal Mediation Analysis
#> Nonparametric Bootstrap Confidence Intervals with the Percentile Method
#> Estimate 95% CI Lower 95% CI Upper p-value
#> ACME 0.1872 0.1214 0.26 <2e-16 ***
#> ADE -0.0424 -0.1518 0.06 0.43
#> Total Effect 0.1448 0.0435 0.25 <2e-16 ***
#> Prop. Mediated 1.2925 0.7277 4.14 <2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> Sample Size Used: 150
#> Simulations: 1000
# Plot helper: draw the mediation-analysis summary (ACME / ADE / total effect
# point estimates with confidence intervals) as a base-graphics plot.
# NOTE(fix): the original file defined plot_mediation twice; the first
# definition (base plot() with ggplot2 theme layers added) was invalid and was
# immediately overwritten by the second, so only the effective definition is
# kept here. Behavior is unchanged.
plot_mediation <- function(med_result) {
plot(med_result, main = "Mediation Analysis Results")
}
# Plot to the current device
plot_mediation(causal_mediation_results)
# Save the same plot to PDF
pdf("results/Other_microbiome_analysis/Mediation_Analysis_Plot.pdf", width = 8, height = 6)
plot_mediation(causal_mediation_results)
dev.off()
#> png
#> 2
# BiocManager::install("mixOmics")
# install.packages("C:/Program Files/R/R-4.3.1/library/mixOmics_6.28.0.tar.gz", repos = NULL, type="source")
library(mixOmics)
library(RColorBrewer)
conflicts_prefer(mixOmics::pca)
conflicts_prefer(mixOmics::splsda)
# PCA analysis, mainly combining multiple independent variables in a linear manner for dimensionality reduction analysis
data(liver.toxicity)
X <- liver.toxicity$gene
pca_result <- pca(X)
# Set color
custom_colors <- brewer.pal(4, "Set1")
plotIndiv(pca_result, group = liver.toxicity$treatment$Dose.Group,
ind.names = FALSE, legend = TRUE,
col = custom_colors, title = 'Liver toxicity: PCA comp 1 - 2',
legend.title = 'Dose', size.title = rel(1.2),
legend.title.pch = 'Exposure')
# Set theme
theme_set(theme_classic(base_size = 14) +
theme(legend.position = "bottom",
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16)))
plot(pca_result)
p1<- plotLoadings(pca_result, ndisplay = 100, comp = 2,
name.var = liver.toxicity$gene.ID[, "geneBank"],
col = custom_colors[2], size.name = rel(0.5))
#p1
#2. Sparse matrix PCA analysis, this function has an additional keepX parameter compared to PCA, which can be set to the top few genes or samples that work between each group
MyResult.spca <- spca(X, ncomp = 3, keepX = c(15, 10, 5))
# Make sure you assign a color to each group
n_groups <- length(unique(liver.toxicity$treatment$Dose.Group))
col_palette <- brewer.pal(n_groups, "Set1")
# Plot
plotIndiv(MyResult.spca,
group = liver.toxicity$treatment$Dose.Group,
pch = as.factor(liver.toxicity$treatment$Time.Group),
col = col_palette,
legend = TRUE,
title = 'Liver Toxicity: Sparse PCA - Comp 1 vs Comp 2')
#This graph is a polar coordinate graph, mainly reflecting the correlation distribution between variables.
#In PCA, you can also directly look at the explained variance score (which mainly reflects the contribution of the independent variable to the dependent variable, ranging from 0 to 1, with larger values indicating greater contribution), which requires the use of the function tune. pca (X)
# 3.PLS-DA analysis, although partial least squares method was not initially applied to classification and discrimination problems. Later, after renovation, it was still used for classification research.
data(srbct)
X <- srbct$gene
Y <- srbct$class
# sPLS-DA analysis
# keepX = number of genes kept on each of the two components.
MyResult.splsda <- splsda(X, Y, keepX = c(50, 50))
# Set color
n_groups <- length(unique(Y))
col_palette <- brewer.pal(n_groups, "Set1") # choose the palette size from the number of groups
# Plot
plotIndiv(MyResult.splsda,
col = col_palette,
legend = TRUE,
title = 'sPLS-DA: SRBCT Data')
#4. PLS analysis, partial least squares regression is a multivariate method that designs two data matrices X and Y. By modeling the structure of two matrices, PLS goes beyond traditional multiple regression. Unlike traditional multiple regression models, it is not limited to unrelated variables. One of the advantages of PLS is that it can handle many noisy, collinear, and missing variables, and can also model several response variables simultaneously in Y.
# Load data
data(nutrimouse)
X <- nutrimouse$gene
Y <- nutrimouse$lipid
# sPLS analysis
# keepX/keepY = number of genes/lipids retained per component (2 components).
MyResult.spls <- spls(X, Y, keepX = c(25, 25), keepY = c(5, 5))
#MyResult.spls$names
# Set color
n_groups <- length(unique(nutrimouse$genotype))
print(n_groups) # 2
# Console echo commented out: a bare "[1] 2" line is not valid R and breaks
# source()-ing this script.
# [1] 2
# Set color
col_palette <- c("blue", "red")
# Plot
# Sample plot in the averaged XY-variate space, labelled by diet.
plotIndiv(MyResult.spls,
group = nutrimouse$genotype,
rep.space = "XY-variate",
legend = TRUE,
col = col_palette,
ind.names = nutrimouse$diet,
title = 'Nutrimouse: sPLS')
p2<- plotLoadings(MyResult.spls)
#p2
# 5. DIABLO analysis, equivalent to an extension of PLS, can introduce multiple matrices in X.
# Load data
data(breast.TCGA)
# Three training-set omics blocks measured on the same samples.
X <- list(mRNA = breast.TCGA$data.train$mrna,
miRNA = breast.TCGA$data.train$mirna,
protein = breast.TCGA$data.train$protein)
Y <- breast.TCGA$data.train$subtype
# Number of features kept per block on each of the two components.
list.keepX <- list(mRNA = c(16, 17), miRNA = c(18, 5), protein = c(5, 5))
MyResult.diablo <- block.splsda(X, Y, keepX = list.keepX)
# Set color
n_groups <- length(unique(Y))
col_palette_indiv <- brewer.pal(n_groups, "Set1")
col_palette_var <- brewer.pal(n_groups, "Paired")
# Plot
# Sample plot, one panel per omics block.
plotIndiv(MyResult.diablo,
ind.names = FALSE,
legend = TRUE,
cex = c(1, 2, 3),
title = 'BRCA with DIABLO',
col = col_palette_indiv)
# Plot
# Correlation-circle plot of selected variables; labels only for the protein block.
plotVar(MyResult.diablo,
var.names = c(FALSE, FALSE, TRUE),
legend = TRUE,
pch = c(16, 16, 1),
col = col_palette_var)
# DIABLO
# Pairwise correlations between the blocks' first components.
plotDiablo(MyResult.diablo, ncomp = 1)
# Circos
# Inter-block feature correlations above the cutoff.
circosPlot(MyResult.diablo, cutoff = 0.7)
# ROC curve
Myauc.diablo <- auroc(MyResult.diablo, roc.block = "miRNA", roc.comp = 2)
# Console output of auroc() (commented out: bare output lines break source()):
# $mRNA
# $mRNA$comp1
# AUC p-value
# Basal vs Other(s) 0.9970 0.00000
# Her2 vs Other(s) 0.6106 0.06149
# LumA vs Other(s) 0.9883 0.00000
# $mRNA$comp2
# AUC p-value
# Basal vs Other(s) 0.9992 0.000e+00
# Her2 vs Other(s) 0.9703 1.776e-15
# LumA vs Other(s) 0.9970 0.000e+00
# $miRNA
# $miRNA$comp1
# AUC p-value
# Basal vs Other(s) 0.9551 0.0000
# Her2 vs Other(s) 0.5650 0.2716
# LumA vs Other(s) 0.9239 0.0000
# $miRNA$comp2
# AUC p-value
# Basal vs Other(s) 0.9623 0.00e+00
# Her2 vs Other(s) 0.8650 6.67e-10
# LumA vs Other(s) 0.9589 0.00e+00
# $protein
# $protein$comp1
# AUC p-value
# Basal vs Other(s) 0.9524 0.000000
# Her2 vs Other(s) 0.6678 0.004542
# LumA vs Other(s) 0.9874 0.000000
# $protein$comp2
# AUC p-value
# Basal vs Other(s) 0.9790 0.000e+00
# Her2 vs Other(s) 0.9256 6.111e-13
# LumA vs Other(s) 0.9950 0.000e+00
#You can also use the caret package in R software to achieve
# ---- Generalized estimating equations (GEE) on the dietox pig-growth data ----
# Load packages
library(geepack)
library(tidyverse)
# Load data
data("dietox")
# Data processing: treat the copper and vitamin-E doses as factors
dietox <- dietox %>%
mutate(Cu = factor(Cu),
Evit = factor(Evit))
# Define formula: weight modelled by time and the two diet factors
model_formula <- Weight ~ Time + Evit + Cu
# Independence correlation structure
# Pig is the cluster id (repeated weighings of the same animal).
gee_independent <- geeglm(model_formula, id = Pig, data = dietox,
family = gaussian, corstr = "ind")
summary(gee_independent)
# --- summary(gee_independent) console output, commented out: bare output
# --- lines are not valid R and break source()-ing this script ---
# Call:
# geeglm(formula = model_formula, family = gaussian, data = dietox,
# id = Pig, corstr = "ind")
# Coefficients:
# Estimate Std.err Wald Pr(>|W|)
# (Intercept) 15.07283 1.42190 112.371 <2e-16 ***
# Time 6.94829 0.07979 7582.549 <2e-16 ***
# EvitEvit100 2.08126 1.84178 1.277 0.258
# EvitEvit200 -1.11327 1.84830 0.363 0.547
# CuCu035 -0.78865 1.53486 0.264 0.607
# CuCu175 1.77672 1.82134 0.952 0.329
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Correlation structure = independence
# Estimated Scale Parameters:
# Estimate Std.err
# (Intercept) 48.28 9.309
# Number of clusters: 72 Maximum cluster size: 12
# Sequential Wald tests for each model term.
anova(gee_independent)
# --- anova output (commented out) ---
# Analysis of 'Wald statistic' Table
# Model: gaussian, link: identity
# Response: Weight
# Terms added sequentially (first to last)
# Df X2 P(>|Chi|)
# Time 1 7507 <2e-16 ***
# Evit 2 4 0.15
# Cu 2 2 0.41
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interpretation of results: Time variable significantly affects weight
# Exchangeable correlation structure
# Assumes a single common within-pig correlation (alpha).
gee_exchangeable <- geeglm(model_formula, id = Pig, data = dietox,
family = gaussian, corstr = "ex")
summary(gee_exchangeable)
# --- summary(gee_exchangeable) console output, commented out: bare output
# --- lines are not valid R and break source()-ing this script ---
# Call:
# geeglm(formula = model_formula, family = gaussian, data = dietox,
# id = Pig, corstr = "ex")
# Coefficients:
# Estimate Std.err Wald Pr(>|W|)
# (Intercept) 15.0984 1.4206 112.96 <2e-16 ***
# Time 6.9426 0.0796 7605.79 <2e-16 ***
# EvitEvit100 2.0414 1.8431 1.23 0.27
# EvitEvit200 -1.1103 1.8452 0.36 0.55
# CuCu035 -0.7652 1.5354 0.25 0.62
# CuCu175 1.7871 1.8189 0.97 0.33
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Correlation structure = exchangeable
# Estimated Scale Parameters:
# Estimate Std.err
# (Intercept) 48.3 9.31
# Link = identity
# Estimated Correlation Parameters:
# Estimate Std.err
# alpha 0.766 0.0326
# Number of clusters: 72 Maximum cluster size: 12
# Sequential Wald tests for each model term.
anova(gee_exchangeable)
# --- anova output (commented out) ---
# Analysis of 'Wald statistic' Table
# Model: gaussian, link: identity
# Response: Weight
# Terms added sequentially (first to last)
# Df X2 P(>|Chi|)
# Time 1 7604 <2e-16 ***
# Evit 2 4 0.16
# Cu 2 2 0.41
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interpretation of results: Time variable is still the only significant influencing factor
# Autoregressive correlation structure
# AR(1): correlation decays with the time lag between weighings.
gee_ar1 <- geeglm(model_formula, id = Pig, data = dietox,
family = gaussian, corstr = "ar1")
summary(gee_ar1)
# --- summary(gee_ar1) console output, commented out: bare output lines are
# --- not valid R and break source()-ing this script ---
# Call:
# geeglm(formula = model_formula, family = gaussian, data = dietox,
# id = Pig, corstr = "ar1")
# Coefficients:
# Estimate Std.err Wald Pr(>|W|)
# (Intercept) 17.6124 1.3354 173.95 <2e-16 ***
# Time 6.7324 0.0756 7921.11 <2e-16 ***
# EvitEvit100 2.3782 1.7676 1.81 0.18
# EvitEvit200 -0.9779 1.7369 0.32 0.57
# CuCu035 -0.3976 1.3928 0.08 0.78
# CuCu175 1.2376 1.7376 0.51 0.48
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Correlation structure = ar1
# Estimated Scale Parameters:
# Estimate Std.err
# (Intercept) 50.5 9.41
# Link = identity
# Estimated Correlation Parameters:
# Estimate Std.err
# alpha 0.933 0.0116
# Number of clusters: 72 Maximum cluster size: 12
# Sequential Wald tests for each model term.
anova(gee_ar1)
# --- anova output (commented out) ---
# Analysis of 'Wald statistic' Table
# Model: gaussian, link: identity
# Response: Weight
# Terms added sequentially (first to last)
# Df X2 P(>|Chi|)
# Time 1 7907 <2e-16 ***
# Evit 2 5 0.07 .
# Cu 2 1 0.65
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interpretation of the results: The Time variable continues to remain significant, and the coefficient and standard error have almost remained unchanged.
# Unstructured correlation structure
# One free correlation parameter per pair of measurement occasions.
gee_unstructured <- geeglm(model_formula, id = Pig, data = dietox,
family = gaussian, corstr = "unstructured")
summary(gee_unstructured)
# --- summary(gee_unstructured) console output, commented out: bare output
# --- lines are not valid R and break source()-ing this script ---
# Call:
# geeglm(formula = model_formula, family = gaussian, data = dietox,
# id = Pig, corstr = "unstructured")
# Coefficients:
# Estimate Std.err Wald Pr(>|W|)
# (Intercept) 14.574 1.720 71.79 <2e-16 ***
# Time 7.544 0.131 3329.24 <2e-16 ***
# EvitEvit100 -2.312 2.218 1.09 0.30
# EvitEvit200 -1.790 1.952 0.84 0.36
# CuCu035 1.082 2.098 0.27 0.61
# CuCu175 2.424 2.264 1.15 0.28
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Correlation structure = unstructured
# Estimated Scale Parameters:
# Estimate Std.err
# (Intercept) 63.3 12.5
# Link = identity
# Estimated Correlation Parameters:
# Estimate Std.err
# alpha.1:2 0.2950 0.0435
# alpha.1:3 0.2586 0.0498
# alpha.1:4 0.1974 0.0670
# alpha.1:5 0.1625 0.0771
# alpha.1:6 0.0811 0.0915
# alpha.1:7 0.1141 0.1029
# alpha.1:8 0.1293 0.1050
# alpha.1:9 0.1126 0.1197
# alpha.1:10 0.1022 0.1400
# alpha.1:11 0.1720 0.1479
# alpha.1:12 0.0874 0.1780
# alpha.2:3 0.3831 0.0572
# alpha.2:4 0.3906 0.0621
# alpha.2:5 0.4058 0.0631
# alpha.2:6 0.3787 0.0625
# alpha.2:7 0.4091 0.0659
# alpha.2:8 0.4155 0.0664
# alpha.2:9 0.4100 0.0737
# alpha.2:10 0.3950 0.0865
# alpha.2:11 0.4403 0.0966
# alpha.2:12 0.4133 0.1235
# alpha.3:4 0.5304 0.0701
# alpha.3:5 0.5721 0.0663
# alpha.3:6 0.5694 0.0606
# alpha.3:7 0.5861 0.0644
# alpha.3:8 0.5884 0.0651
# alpha.3:9 0.6040 0.0700
# alpha.3:10 0.5897 0.0811
# alpha.3:11 0.6292 0.0890
# alpha.3:12 0.6416 0.1100
# alpha.4:5 0.7128 0.0537
# alpha.4:6 0.7500 0.0464
# alpha.4:7 0.7617 0.0430
# alpha.4:8 0.7673 0.0433
# alpha.4:9 0.7910 0.0482
# alpha.4:10 0.7828 0.0579
# alpha.4:11 0.8195 0.0686
# alpha.4:12 0.8598 0.0887
# alpha.5:6 0.9187 0.0571
# alpha.5:7 0.9272 0.0512
# alpha.5:8 0.9260 0.0443
# alpha.5:9 0.9625 0.0420
# alpha.5:10 0.9306 0.0496
# alpha.5:11 0.9788 0.0532
# alpha.5:12 1.0413 0.0684
# alpha.6:7 1.0427 0.0712
# alpha.6:8 1.0405 0.0622
# alpha.6:9 1.0937 0.0637
# alpha.6:10 1.0680 0.0543
# alpha.6:11 1.1005 0.0585
# alpha.6:12 1.2003 0.0706
# alpha.7:8 1.1054 0.0619
# alpha.7:9 1.1533 0.0552
# alpha.7:10 1.1307 0.0492
# alpha.7:11 1.1735 0.0487
# alpha.7:12 1.2591 0.0762
# alpha.8:9 1.1819 0.0533
# alpha.8:10 1.1529 0.0504
# alpha.8:11 1.1942 0.0510
# alpha.8:12 1.2917 0.0841
# alpha.9:10 1.2467 0.0510
# alpha.9:11 1.3051 0.0558
# alpha.9:12 1.4178 0.0882
# alpha.10:11 1.3561 0.0627
# alpha.10:12 1.4596 0.1023
# alpha.11:12 1.5600 0.1103
# Number of clusters: 72 Maximum cluster size: 12
# Sequential Wald tests for each model term.
anova(gee_unstructured)
# --- anova output (commented out) ---
# Analysis of 'Wald statistic' Table
# Model: gaussian, link: identity
# Response: Weight
# Terms added sequentially (first to last)
# Df X2 P(>|Chi|)
# Time 1 20.21 6.9e-06 ***
# Evit 2 7.01 0.03 *
# Cu 2 1.15 0.56
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Result interpretation: The Evit variable shows a certain influence under this correlation structure, and the coefficient and standard error have changed
# Model summary
# Collect all four fits for side-by-side comparison.
model_summaries <- list(
"Independence" = summary(gee_independent),
"Exchangeable" = summary(gee_exchangeable),
"AR(1)" = summary(gee_ar1),
"Unstructured" = summary(gee_unstructured)
)
# It can be seen from the model results that the Time variable shows a significant impact under different correlation structure assumptions.
# The Evit variable only shows significance under the unstructured correlation assumption, which indicates that different correlation structure choices may have a certain impact on the model results.
# ---- Structural equation model (lavaan) linking OM / disease status to diversity ----
# Load packages
library(lavaan)
library(semPlot)
# Load data
mydata <- read.csv("data/meta.csv", row.names = 1)
# Binary indicator: 1 when status == 2, else 0.
mydata$status_dummy <- ifelse(mydata$status == 2, 1, 0)
# Get the numeric variables and exclude the status and status_dummy columns
numeric_vars <- names(mydata)[sapply(mydata, is.numeric)]
numeric_vars_to_scale <- numeric_vars[!(numeric_vars %in% c("status", "status_dummy"))]
# Standardize numerical variables
mydata_scaled <- mydata
mydata_scaled[, numeric_vars_to_scale] <- scale(mydata[, numeric_vars_to_scale])
# Define SEM model
# Each latent variable has a single indicator; regressions on OM and status,
# plus a residual covariance between the two richness latents.
model <- '
latentBact =~ Bacteria_richness
latentFung =~ Fungi_richness
latentFAPROTAX =~ FAPROTAX
latentFunGuild =~ FunGuild
latentBact ~ OM + status_dummy
latentFung ~ OM + status_dummy
latentFAPROTAX ~ OM + status_dummy
latentFunGuild ~ OM + status_dummy
latentBact ~~ latentFung
'
# Fitting the model
fit <- sem(model, data = mydata_scaled, int.ov.free = TRUE)
# Save model summary, coefficients, and ANOVA results to file
capture.output(
summary(fit),
coef(fit),
anova(fit),
file = "results/Other_microbiome_analysis/model01_stats.txt"
)
# Draw a structural equation model diagram and optimize the layout and color matching
sem_plot <- semPaths(
fit,
what = "est", # Display path coefficients
edge.label.cex = 1.0, # Increase edge label font size
layout = "tree2", # Use a clearer tree layout
style = "lisrel", # Use Lisrel styles to simplify graphics
color = list(lat = "#76C7C0", man = "#F7C04A", edge = "gray50"), # Set color
sizeMan = 8, # Set the node size of the observation variable
sizeLat = 10, # Setting the node size of latent variables
residuals = FALSE, # Do not display residuals
mar = c(6, 6, 6, 6) # Set margins to increase the sense of space
)
# Save plot
pdf("results/Other_microbiome_analysis/SEM_Plot01.pdf", width = 10, height = 8)
semPaths(fit, what = "est", edge.label.cex = 1.0, layout = "tree2",
style = "lisrel", color = list(lat = "#76C7C0", man = "#F7C04A", edge = "gray50"),
sizeMan = 8, sizeLat = 10, residuals = FALSE, mar = c(6, 6, 6, 6))
dev.off()
# Console echo from dev.off() ("png 2") commented out: bare `png` / `2`
# lines are not valid R and break source()-ing this script.
# png
# 2
# Save plot
png("results/Other_microbiome_analysis/SEM_Plot01.png", width = 1200, height = 960, res = 150)
semPaths(fit, what = "est", edge.label.cex = 1.0, layout = "tree2",
style = "lisrel", color = list(lat = "#76C7C0", man = "#F7C04A", edge = "gray50"),
sizeMan = 8, sizeLat = 10, residuals = FALSE, mar = c(6, 6, 6, 6))
dev.off()
# png
# 2
# ---- Survival analysis on the lung cancer dataset ----
# Load packages
library(survival)
library(dplyr)
library(autoReg)
library(rrtable)
library(survminer)
library(jstable)
library(patchwork)
# Load data
# Drop rows with any missing values, then code sex and ECOG score as factors.
mydata <- survival::lung %>% na.omit()
mydata <- mydata %>% mutate(sex = factor(sex), ph.ecog = factor(ph.ecog))
# Descriptive Statistics: Generate a table of baseline characteristics
table1 <- gaze(status ~ ., data = mydata) %>% myft()
table2 <- gaze(status ~ ., data = mydata, method = 3) %>% myft()
# Fitting survival curves
fit <- survfit(Surv(time, status) ~ sex, data = mydata)
# Survival curves
pdf("results/Other_microbiome_analysis/survival_plot.pdf", width = 10, height = 8)
surv_plot <- ggsurvplot(
fit,
data = mydata,
conf.int = TRUE,
pval = TRUE,
pval.size = 4.5,
surv.median.line = "hv",
risk.table = TRUE,
risk.table.height = 0.2,
risk.table.col = "strata",
xlab = "Follow-up time (days)",
legend.title = "Sex",
legend.labs = c("Male", "Female"),
palette = c("#E64B35FF", "#4DBBD5FF"),
break.x.by = 100,
ggtheme = theme_classic() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)
)
)
surv_plot
dev.off()
# Console echo from dev.off() ("png 2") commented out: bare `png` / `2`
# lines are not valid R and break source()-ing this script.
# png
# 2
# Use patchwork to combine the main survival curve plot with additional components such as risk tables
# complete_plot <- surv_plot$plot / surv_plot$table
# Save plot
# ggsave("survival_plot.pdf",
# #plot = complete_plot,
# plot = surv_plot$plot,
# device = "pdf",
# width = 8, height = 10)
# Cox regression model fitting and presentation
cox_fit <- coxph(Surv(time, status) ~ age + sex + ph.ecog + ph.karno + pat.karno + meal.cal + wt.loss,
data = mydata)
cox_result <- autoReg(cox_fit, uni = TRUE) %>% myft()
# Export Cox regression results as PPTX
table2pptx(cox_result)
# Forest plot of Cox model results
forest_plot <- ggforest(
cox_fit,
data = mydata,
main = "Hazard Ratios (95% CI)",
fontsize = 0.9,
cpositions = c(0.02, -0.1, 0.3),
refLabel = "Reference",
noDigits = 2
)
forest_plot
# Save plot
ggsave("results/Other_microbiome_analysis/cox_forest_plot.pdf", plot = forest_plot, device = "pdf", width = 9, height = 6)
# Variable truncation and grouping
# Recode status to 0/1 for surv_cutpoint.
mydata$status <- mydata$status - 1
# NOTE(review): sex was converted to a factor above; the comparison relies on
# factor levels "1"/"2" -- confirm this recoding is intended.
mydata$sex <- ifelse(mydata$sex == 1, "Male", "Female")
cutpoints <- surv_cutpoint(mydata, time = "time", event = "status",
variables = c("age", "meal.cal", "ph.karno", "pat.karno"))
mydata_cut <- surv_categorize(cutpoints)
# Use cbind to merge mydata_cut with other columns of mydata
mydata <- cbind(mydata[, c("sex", "ph.ecog", "wt.loss")], mydata_cut)
# Check the structure of the merged dataset
str(mydata)
# --- str() console output, commented out: bare output lines are not valid R
# --- and break source()-ing this script ---
# 'data.frame': 167 obs. of 9 variables:
# $ sex : chr "Male" "Male" "Male" "Female" ...
# $ ph.ecog : Factor w/ 4 levels "0","1","2","3": 1 2 2 3 3 2 3 2 2 2 ...
# $ wt.loss : num 15 11 0 10 1 16 34 27 60 -5 ...
# $ time : num 455 210 1022 310 361 ...
# $ status : num 1 1 0 1 1 1 1 1 1 1 ...
# $ age : chr "high" "low" "high" "high" ...
# $ meal.cal : chr "high" "high" "high" "low" ...
# $ ph.karno : chr "high" "high" "low" "low" ...
# $ pat.karno: chr "high" "low" "high" "low" ...
# Subgroup analysis
# Univariate Cox model of sex within each dichotomized covariate subgroup.
subgroup_res <- TableSubgroupMultiCox(
formula = Surv(time, status) ~ sex,
var_subgroups = c("age", "meal.cal", "ph.karno", "pat.karno"),
data = mydata
)
write.csv(subgroup_res, "results/Other_microbiome_analysis/subgroup_analysis_results.csv")
# Analysis of differences among groups in environmental factors
env = read.csv("./data/env.csv")
data = as.data.frame(env)
# as.tibble() is deprecated (removed in recent tibble releases); use as_tibble().
data = as_tibble(data)
# ANOVA with multiple-comparison letters for every numeric column from 3 on.
result = EasyStat::MuiaovMcomper2(data = data,num = c(3:ncol(data)))
# Faceted box plots for columns 3-6 with significance letters.
result1 = EasyStat::FacetMuiPlotresultBox(data = data,
num = c(3:6),
result = result,
sig_show ="abc",ncol = 5 )
# Shared ggplot theme for the environmental-factor difference plots.
mytheme1 = theme_classic() + theme(
panel.background=element_blank(),
panel.grid=element_blank(),
legend.position="right",
legend.title = element_blank(),
legend.background=element_blank(),
legend.key=element_blank(),
# legend.text= element_text(size=7),
# text=element_text(),
# axis.text.x=element_text(angle=45,vjust=1, hjust=1)
plot.title = element_text(vjust = -8.5,hjust = 0.1),
axis.title.y =element_text(size = 15,face = "bold",colour = "black"),
axis.title.x =element_text(size = 15,face = "bold",colour = "black"),
axis.text = element_text(size = 10,face = "bold"),
axis.text.x = element_text(colour = "black",size = 10),
axis.text.y = element_text(colour = "black",size = 10),
legend.text = element_text(size = 10,face = "bold")
)
colset1 <- RColorBrewer::brewer.pal(9,"Set1")
# Styled faceted box plot from the EasyStat result.
p1_1 = result1[[1]] +
mytheme1 +
ggplot2::guides(fill = guide_legend(title = NULL)) +
ggplot2::scale_fill_manual(values = colset1)
p1_1
# NOTE(review): this reassigns p1_1, discarding the box plot built above;
# the violin plot below is what later gets saved as "env_Facet_box" -- confirm intended.
p1_1 = result1[[2]] %>% ggplot(aes(x=group , y=dd )) +
geom_violin(alpha=1, aes(fill=group)) +
geom_jitter( aes(color = group),position=position_jitter(0.17), size=3, alpha=0.5)+
labs(x="", y="")+
facet_wrap(.~name,scales="free_y",ncol = 3) +
# theme_classic()+
geom_text(aes(x=group , y=y ,label=stat)) +
# ggplot2::scale_x_discrete(limits = axis_order) +
mytheme1 +
guides(color=guide_legend(title = NULL),
shape=guide_legend(title = NULL),
fill = guide_legend(title = NULL)
) +
ggplot2::scale_fill_manual(values = colset1)
p1_1
# Faceted bar plot with significance letters.
res = EasyStat::FacetMuiPlotresultBar(data = data,num = c(3:ncol(data)),result = result,sig_show ="abc",ncol = 5)
p1_2 = res[[1]]+
# scale_x_discrete(limits = axis_order) +
guides(color = FALSE) +
mytheme1+
guides(fill = guide_legend(title = NULL))+
scale_fill_manual(values = colset1)
p1_2
# Combined box + bar plot.
res = EasyStat::FacetMuiPlotReBoxBar(data = data,num = c(3:ncol(data)),result = result,sig_show ="abc",ncol = 5)
p1_3 = res[[1]]+
# scale_x_discrete(limits = axis_order) +
mytheme1 +
guides(fill = guide_legend(title = NULL))+
scale_fill_manual(values = colset1)
p1_3
# Number of groups; used to scale the output figure dimensions.
gnum = 4
FileName <- paste("results/Other_microbiome_analysis/env_diff/env_Facet_box", ".pdf", sep = "")
ggsave(FileName, p1_1, width = ((1 + gnum) * 3), height =4*gnum,limitsize = FALSE)
FileName <- paste("results/Other_microbiome_analysis/env_diff/env_Facet_bar", ".pdf", sep = "")
ggsave(FileName, p1_2, width = ((1 + gnum) * 3), height = 4*gnum,limitsize = FALSE)
FileName <- paste("results/Other_microbiome_analysis/env_diff/env_Facet_boxbar", ".pdf", sep = "")
ggsave(FileName, p1_3, width = ((1 + gnum) * 3), height = 4*gnum,limitsize = FALSE)
FileName <- paste("results/Other_microbiome_analysis/env_diff/env_Facet_box", ".jpg", sep = "")
ggsave(FileName, p1_1, width = ((1 + gnum) * 3), height =4*gnum,limitsize = FALSE)
FileName <- paste("results/Other_microbiome_analysis/env_diff/env_Facet_bar", ".jpg", sep = "")
ggsave(FileName, p1_2, width = ((1 + gnum) * 3), height = 4*gnum,limitsize = FALSE)
FileName <- paste("results/Other_microbiome_analysis/env_diff/env_Facet_boxbar", ".jpg", sep = "")
ggsave(FileName, p1_3, width = ((1 + gnum) * 3), height = 4*gnum,limitsize = FALSE)
# ---- RDA of Hellinger-transformed environmental data constrained by group ----
# Load packages
library(vegan)
library(ggrepel)
# Load data
set.seed(365)
env = read.csv("./data/env2.csv")
data1 <- env[, 3:36]
# Hellinger transform before constrained ordination.
data1 <- decostand(data1, method = "hellinger")
data2 = as.data.frame(env$group)
res <- rda(data1 ~ . , data2)
# Select RDA1, RDA2 axis for visualization
centroids <- as.data.frame(res$CCA$centroids[,c(1,2)])
# Add a column to the centroids data frame to distinguish shapes and color them
# Assumes the 4 centroids come in two pairs (A then B) -- TODO confirm.
centroids$group <- factor(c(rep('A',2),rep('B',2)), levels = c('A', 'B'))
rda.v <- as.data.frame(res$CCA$v[,c(1,2)])
# Extract the response variable name for displaying labels in the plot
rda.v$name = row.names(rda.v)
arrow_data <- data.frame(x=rda.v[,1], y = rda.v[,2], x_end=0, y_end=0, name=rda.v[,3], col='blue')
# Highlight a hand-picked set of variables in red.
arrow_data[arrow_data$name %in% c('AP', 'LB', 'LRL1', 'RGR'), ]$col <- 'red'
p1 <- ggplot(data = centroids) +
geom_point(size=2, aes(x = RDA1,y=RDA2,color=group, shape=group))+
ggrepel::geom_text_repel(data = arrow_data, aes(x,y,label=name),
size=3, fontface="italic")+
geom_segment(data = arrow_data,
aes(x=0, y=0, xend=x, yend=y),
arrow = arrow(length = unit(0.05,"inches")), color = arrow_data$col, size=.8)+
geom_hline(yintercept = 0, linetype = "dashed", size=1.2) +
geom_vline(xintercept = 0,linetype = "dashed", size=1.2)+
theme(legend.title = element_blank(), legend.position = c(0.9,0.8),
legend.background = element_blank()) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
panel.border =element_rect(colour = "black", fill=NA, size=1)) +
# NOTE(review): axis percentages are hard-coded; recompute from res eigenvalues if data change.
labs(x = 'RDA1(0.734%)',y = 'RDA2(0.048%)')
p1
ggsave('results/Other_microbiome_analysis/RDA_plot.pdf', p1, height = 6, width = 7)
# ---- DCA to choose between RDA and CCA, then both ordinations ----
# Load packages
library(vegan)
library(ggrepel)
library(ggplot2)
# Before conducting the ordination analysis, we need to perform a detrended correspondence analysis (DCA) on the species community data and make a judgment based on the value of Lengths of gradient in the result. The result will give 4 Lengths of gradient values. If the largest value is greater than 4.0, choose CCA. If it is between 3.0-4.0, choose RDA or CCA. If it is less than 3.0, the result of RDA is better than CCA. However, this standard is not 100% appropriate. In actual use, it is best to perform CCA and RDA at the same time and make a choice based on the results.
# Loading sample species community data
# sampledata <- read.table("otu_table.txt", head = TRUE, row.names=1,sep="\t")
sampledata <- read.csv("data/test_otu.csv",row.names = 1)
sampledata <- sampledata[ , 1:24]
# Analysis requires data table rows as samples, columns as species
sampledata <- t(sampledata)
# DCA analysis
dca <- decorana(veg = sampledata)
# Call up the Lengths of gradient value and save it
dca1 <- max(dca$rproj[,1])
dca2 <- max(dca$rproj[,2])
dca3 <- max(dca$rproj[,3])
dca4 <- max(dca$rproj[,4])
GL <- data.frame(DCA1 = c(dca1), DCA2 = c(dca2), DCA3 = c(dca3), DCA4 = c(dca4))
rownames(GL) <- c("Gradient length")
write.csv(GL, file = "results/Other_microbiome_analysis/dca.csv")
# redundancy analysis (RDA) or canonical correspondence analysis (CCA)
# RDA combines correspondence analysis with multiple regression analysis, and each step of the calculation is regressed with environmental factors.
# CCA is a ranking method based on a unimodal model and combines multiple environmental factors in the ranking process.
# RDA is based on a linear model, while CCA is based on a unimodal model. You can choose which analysis method gives you the best community ranking results.
design<-read.csv("data/test_design.csv",row.names = 1)
otu_tax<-read.csv("data/test_otu.csv",row.names = 1)
# Correlating microorganisms with environmental factors, CCA analysis
# OTUs 200-300 x samples 1-24, constrained by environmental columns 6-11.
cca <- cca(t(otu_tax[200:300,1:24]), design[,6:11], scale = TRUE)
ccascore <- scores(cca)
# Or RDA analysis
RDA = rda(t(otu_tax[200:300,1:24]),design[,6:11], scale = TRUE)
# Get the first and second axes of RDA. Here, RDA is not visualized, but CCA is used as an example for visualization.
RDA_sample<- scores(RDA,choices = 1:2, display = 'sp')
RDA_env<-RDA$CCA$biplot[,1:2]
# Get the first and second axes of CCA
CCAE <- as.data.frame(cca$CCA$biplot[,1:2])
CCA1 <- ccascore$sites[,1]
CCA2 <- ccascore$sites[,2]
# Construct a data frame with the first and second axes of the samples
plotdata <- data.frame(rownames(ccascore$sites), CCA1, CCA2)
colnames(plotdata) <- c("sample","CCA1","CCA2")
plotdata<-cbind(plotdata,design)
# Get the explanation percentage of the first and second axes, and keep one decimal place
cca1 <- round(cca$CCA$eig[1]/sum(cca$CCA$eig)*100,1)
cca2 <- round(cca$CCA$eig[2]/sum(cca$CCA$eig)*100,1)
# Plot CCA results
# Reference columns by name inside aes() instead of plotdata$... / CCAE[,i]:
# using $/[,] in aes() bypasses ggplot's data masking and is fragile with
# facets/stats, and it produces an ugly "plotdata$Treatment" legend title.
p1 <- ggplot(plotdata,aes(x=CCA1,y=CCA2,color=Treatment))+
geom_point(size=4,aes())+
stat_ellipse(aes(fill=Treatment),geom = "polygon",size=0.6,level = 0.95,alpha = 0.1)+
geom_segment(data=CCAE,aes(x = 0, y = 0, xend = CCA1*3.5, yend = CCA2*3.5),
arrow = arrow(length = unit(0.03, 'npc')),size =1,color="red")+
geom_text(data = CCAE,aes(CCA1 * 4,CCA2 * 4,label = rownames(CCAE)),color = 'red',size = 4)+
xlab(paste("CCA1 (",cca1,"%",")"))+ylab(paste("CCA2 (",cca2,"%",")"))+
theme_bw(base_line_size = 1.05,base_rect_size = 1.05)+
scale_color_manual(values=c("#3FBDA7","#0172B6","#BD3C29","#F0965D"))+
scale_fill_manual(values=c("#3FBDA7","#0172B6","#BD3C29","#F0965D"))+
theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank())+
geom_hline(aes(yintercept=0), colour="gray45",size=0.8, linetype="dashed")+
geom_vline(aes(xintercept=0), colour="gray45",size=0.8, linetype="dashed")
p1
ggsave("results/Other_microbiome_analysis/CCA_plot.pdf", p1, width = 8, height = 6)
# ---- Hierarchical partitioning demo data (rdacca.hp) with a plain RDA ----
# Load packages and data
library(rdacca.hp)
data(mite) # Oribatid mite species abundance matrix at the observation site
mite[1:6,1:6]
# --- console echo of the head of mite, commented out: bare output lines are
# --- not valid R and break source()-ing this script ---
# Brachy PHTH HPAV RARD SSTR Protopl
# 1 17 5 5 3 2 1
# 2 2 7 16 0 6 0
# 3 4 3 1 1 2 0
# 4 23 7 10 2 2 0
# 5 5 8 13 9 0 13
# 6 19 7 5 9 3 2
#write.csv(mite,"mite.csv")
data(mite.env) # Environmental variable matrix of observation sites
#write.csv(mite.env,"miteenv.csv")
data(mite.xy) # Geographic coordinates of the observation site
#write.csv(mite.xy,"mitexy.csv")
# RDA
# mite <- read.csv(file.choose(),header = T,row.names = 1)
# Hellinger transform the abundances, then RDA on all environmental variables.
mite.hel <- decostand(mite, method = 'hellinger')
mite.rda <- rda(mite.hel~., mite.env, scale = FALSE)
summary(mite.rda)
Call:
rda(formula = mite.hel ~ SubsDens + WatrCont + Substrate + Shrub + Topo, data = mite.env, scale = FALSE)
Partitioning of variance:
Inertia Proportion
Total 0.394 1.000
Constrained 0.208 0.527
Unconstrained 0.187 0.473
Eigenvalues, and their contribution to the variance
Importance of components:
RDA1 RDA2 RDA3 RDA4 RDA5 RDA6 RDA7 RDA8
Eigenvalue 0.137 0.0247 0.0138 0.0112 0.0083 0.00548 0.00211 0.00189
Proportion Explained 0.348 0.0626 0.0351 0.0284 0.0210 0.01391 0.00534 0.00480
Cumulative Proportion 0.348 0.4108 0.4459 0.4742 0.4953 0.50920 0.51454 0.51934
RDA9 RDA10 RDA11 PC1 PC2 PC3 PC4
Eigenvalue 0.00127 0.000983 0.00057 0.0427 0.0238 0.0172 0.0137
Proportion Explained 0.00323 0.002494 0.00144 0.1084 0.0605 0.0436 0.0347
Cumulative Proportion 0.52257 0.525060 0.52650 0.6349 0.6954 0.7390 0.7737
PC5 PC6 PC7 PC8 PC9 PC10 PC11
Eigenvalue 0.0116 0.00968 0.00769 0.00727 0.00674 0.00527 0.00478
Proportion Explained 0.0293 0.02456 0.01950 0.01844 0.01708 0.01336 0.01213
Cumulative Proportion 0.8030 0.82761 0.84711 0.86555 0.88263 0.89599 0.90812
PC12 PC13 PC14 PC15 PC16 PC17 PC18
Eigenvalue 0.00439 0.0040 0.00353 0.00294 0.00260 0.00251 0.00222
Proportion Explained 0.01115 0.0102 0.00895 0.00746 0.00658 0.00638 0.00563
Cumulative Proportion 0.91927 0.9294 0.93838 0.94583 0.95242 0.95879 0.96443
PC19 PC20 PC21 PC22 PC23 PC24 PC25
Eigenvalue 0.00210 0.00194 0.00171 0.0013 0.0011 0.00102 0.000923
Proportion Explained 0.00533 0.00491 0.00435 0.0033 0.0028 0.00258 0.002342
Cumulative Proportion 0.96976 0.97467 0.97901 0.9823 0.9851 0.98769 0.990031
PC26 PC27 PC28 PC29 PC30 PC31
Eigenvalue 0.000806 0.000662 0.000545 0.000478 0.000461 0.000367
Proportion Explained 0.002045 0.001680 0.001383 0.001211 0.001168 0.000932
Cumulative Proportion 0.992076 0.993756 0.995139 0.996351 0.997519 0.998451
PC32 PC33 PC34 PC35
Eigenvalue 0.000237 0.000178 0.000138 5.74e-05
Proportion Explained 0.000601 0.000451 0.000351 1.45e-04
Cumulative Proportion 0.999052 0.999504 0.999855 1.00e+00
Accumulated constrained eigenvalues
Importance of components:
RDA1 RDA2 RDA3 RDA4 RDA5 RDA6 RDA7 RDA8
Eigenvalue 0.137 0.0247 0.0138 0.0112 0.0083 0.00548 0.00211 0.00189
Proportion Explained 0.661 0.1188 0.0666 0.0539 0.0400 0.02642 0.01015 0.00912
Cumulative Proportion 0.661 0.7803 0.8469 0.9007 0.9407 0.96712 0.97727 0.98639
RDA9 RDA10 RDA11
Eigenvalue 0.00127 0.000983 0.00057
Proportion Explained 0.00613 0.004738 0.00274
Cumulative Proportion 0.99252 0.997256 1.00000
# Default RDA triplot
plot(mite.rda)
# Hierarchical partitioning: split the adjusted R2 of the RDA into the
# unique + average shared contribution of each environmental variable
mite.rda.hp <- rdacca.hp(mite.hel, mite.env, method = 'RDA', type = 'adjR2', scale = FALSE)
mite.rda.hp
$Method_Type
[1] "RDA" "adjR2"
$Total_explained_variation
[1] 0.437
$Hier.part
Unique Average.share Individual I.perc(%)
SubsDens 0.0363 -0.0043 0.0320 7.32
WatrCont 0.1009 0.0633 0.1642 37.57
Substrate 0.0416 0.0127 0.0543 12.43
Shrub 0.0288 0.0730 0.1018 23.30
Topo 0.0465 0.0378 0.0843 19.29
attr(,"class")
[1] "rdaccahp"
# Bar plot of each variable's individual contribution
plot(mite.rda.hp)
# Inspect the hierarchical partitioning table before saving
mite.rda.hp$Hier.part
Unique Average.share Individual I.perc(%)
SubsDens 0.0363 -0.0043 0.0320 7.32
WatrCont 0.1009 0.0633 0.1642 37.57
Substrate 0.0416 0.0127 0.0543 12.43
Shrub 0.0288 0.0730 0.1018 23.30
Topo 0.0465 0.0378 0.0843 19.29
# Save the RDA hierarchical partitioning table
write.csv(mite.rda.hp$Hier.part, 'results/Other_microbiome_analysis/mite.rda.hp.csv')
# CCA: same analysis with canonical correspondence analysis on raw counts
mite.cca <- cca(mite~., mite.env)
summary(mite.cca)
Call:
cca(formula = mite ~ SubsDens + WatrCont + Substrate + Shrub + Topo, data = mite.env)
Partitioning of scaled Chi-square:
Inertia Proportion
Total 1.696 1.000
Constrained 0.799 0.471
Unconstrained 0.897 0.529
Eigenvalues, and their contribution to the scaled Chi-square
Importance of components:
CCA1 CCA2 CCA3 CCA4 CCA5 CCA6 CCA7 CCA8
Eigenvalue 0.44 0.132 0.0737 0.0453 0.0397 0.0231 0.0179 0.01059
Proportion Explained 0.26 0.078 0.0434 0.0267 0.0234 0.0136 0.0106 0.00625
Cumulative Proportion 0.26 0.338 0.3811 0.4078 0.4312 0.4448 0.4554 0.46165
CCA9 CCA10 CCA11 CA1 CA2 CA3 CA4
Eigenvalue 0.00827 0.00581 0.00209 0.1280 0.1185 0.0966 0.0786
Proportion Explained 0.00488 0.00342 0.00123 0.0755 0.0699 0.0569 0.0463
Cumulative Proportion 0.46653 0.46995 0.47119 0.5467 0.6165 0.6735 0.7198
CA5 CA6 CA7 CA8 CA9 CA10 CA11 CA12
Eigenvalue 0.0632 0.0546 0.0427 0.0345 0.0317 0.0290 0.0269 0.0230
Proportion Explained 0.0372 0.0322 0.0252 0.0203 0.0187 0.0171 0.0159 0.0135
Cumulative Proportion 0.7570 0.7892 0.8144 0.8347 0.8534 0.8706 0.8864 0.9000
CA13 CA14 CA15 CA16 CA17 CA18 CA19
Eigenvalue 0.0202 0.0187 0.0166 0.0156 0.01375 0.01259 0.01225
Proportion Explained 0.0119 0.0111 0.0098 0.0092 0.00811 0.00742 0.00722
Cumulative Proportion 0.9119 0.9229 0.9327 0.9419 0.95002 0.95744 0.96467
CA20 CA21 CA22 CA23 CA24 CA25 CA26
Eigenvalue 0.00964 0.00782 0.00726 0.00607 0.00539 0.00515 0.00418
Proportion Explained 0.00568 0.00461 0.00428 0.00358 0.00318 0.00304 0.00246
Cumulative Proportion 0.97035 0.97496 0.97924 0.98282 0.98600 0.98904 0.99150
CA27 CA28 CA29 CA30 CA31 CA32 CA33
Eigenvalue 0.00361 0.00284 0.0022 0.00192 0.001620 0.001012 0.000860
Proportion Explained 0.00213 0.00167 0.0013 0.00113 0.000955 0.000597 0.000507
Cumulative Proportion 0.99363 0.99530 0.9966 0.99773 0.998690 0.999287 0.999794
CA34
Eigenvalue 0.000349
Proportion Explained 0.000206
Cumulative Proportion 1.000000
Accumulated constrained eigenvalues
Importance of components:
CCA1 CCA2 CCA3 CCA4 CCA5 CCA6 CCA7 CCA8
Eigenvalue 0.440 0.132 0.0737 0.0453 0.0397 0.0231 0.0179 0.0106
Proportion Explained 0.551 0.166 0.0922 0.0567 0.0497 0.0289 0.0224 0.0133
Cumulative Proportion 0.551 0.717 0.8088 0.8655 0.9152 0.9441 0.9665 0.9798
CCA9 CCA10 CCA11
Eigenvalue 0.00827 0.00581 0.00209
Proportion Explained 0.01035 0.00727 0.00261
Cumulative Proportion 0.99012 0.99739 1.00000
# plot(mite.cca)
# Hierarchical partitioning of the CCA; seed fixed for reproducible permutations
set.seed(123)
mite.cca.hp <- rdacca.hp(mite, mite.env, method = 'CCA', type = 'adjR2', scale = FALSE, n.perm = 1000)
mite.cca.hp
$Method_Type
[1] "CCA" "adjR2"
$Total_explained_variation
[1] 0.338
$Hier.part
Unique Average.share Individual I.perc(%)
SubsDens 0.0283 0.0061 0.0344 10.2
WatrCont 0.0871 0.0567 0.1438 42.5
Substrate 0.0423 -0.0015 0.0408 12.1
Shrub 0.0154 0.0571 0.0725 21.4
Topo 0.0257 0.0207 0.0464 13.7
attr(,"class")
[1] "rdaccahp"
# Bar plot of each variable's individual contribution (CCA)
plot(mite.cca.hp)
# Inspect the hierarchical partitioning table before saving
mite.cca.hp$Hier.part
Unique Average.share Individual I.perc(%)
SubsDens 0.0283 0.0061 0.0344 10.2
WatrCont 0.0871 0.0567 0.1438 42.5
Substrate 0.0423 -0.0015 0.0408 12.1
Shrub 0.0154 0.0571 0.0725 21.4
Topo 0.0257 0.0207 0.0464 13.7
# Save the CCA hierarchical partitioning table
write.csv(mite.cca.hp$Hier.part, 'results/Other_microbiome_analysis/mite.cca.hp.csv')
# db-RDA: distance-based RDA on the Bray-Curtis dissimilarity matrix
mite.bray <- vegdist(mite, method = 'bray')
mite.cap <- dbrda(mite.bray~., mite.env)
summary(mite.cap)
Call:
dbrda(formula = mite.bray ~ SubsDens + WatrCont + Substrate + Shrub + Topo, data = mite.env)
Partitioning of squared Bray distance:
Inertia Proportion
Total 14.70 1.000
Constrained 7.42 0.505
Unconstrained 7.27 0.495
Eigenvalues, and their contribution to the squared Bray distance
Importance of components:
dbRDA1 dbRDA2 dbRDA3 dbRDA4 dbRDA5 dbRDA6 dbRDA7 dbRDA8
Eigenvalue 4.603 0.9157 0.6740 0.4007 0.2560 0.2451 0.1516 0.10267
Proportion Explained 0.313 0.0623 0.0459 0.0273 0.0174 0.0167 0.0103 0.00699
Cumulative Proportion 0.313 0.3755 0.4214 0.4486 0.4660 0.4827 0.4930 0.50002
dbRDA9 dbRDA10 dbRDA11 MDS1 MDS2 MDS3 MDS4 MDS5
Eigenvalue 0.0573 0.01640 1.22e-03 1.977 0.9078 0.750 0.6545 0.5565
Proportion Explained 0.0039 0.00112 8.29e-05 0.135 0.0618 0.051 0.0445 0.0379
Cumulative Proportion 0.5039 0.50504 5.05e-01 0.640 0.7014 0.752 0.7970 0.8348
MDS6 MDS7 MDS8 MDS9 MDS10 MDS11 MDS12 MDS13
Eigenvalue 0.4746 0.4329 0.3899 0.3268 0.2802 0.2448 0.2248 0.1931
Proportion Explained 0.0323 0.0295 0.0265 0.0222 0.0191 0.0167 0.0153 0.0131
Cumulative Proportion 0.8671 0.8966 0.9231 0.9453 0.9644 0.9811 0.9964 1.0095
MDS14 MDS15 MDS16 MDS17 MDS18 MDS19 MDS20
Eigenvalue 0.1727 0.1570 0.1338 0.11045 0.1088 0.10508 0.0896
Proportion Explained 0.0118 0.0107 0.0091 0.00752 0.0074 0.00715 0.0061
Cumulative Proportion 1.0213 1.0319 1.0410 1.04856 1.0560 1.06311 1.0692
MDS21 MDS22 MDS23 MDS24 MDS25 MDS26 MDS27
Eigenvalue 0.08357 0.07386 0.05919 0.04980 0.04607 0.04064 0.03607
Proportion Explained 0.00569 0.00503 0.00403 0.00339 0.00313 0.00277 0.00245
Cumulative Proportion 1.07489 1.07992 1.08395 1.08734 1.09047 1.09324 1.09569
MDS28 MDS29 MDS30 MDS31 MDS32 MDS33 MDS34
Eigenvalue 0.02908 0.02638 0.01856 0.01629 0.010561 0.005755 0.00205
Proportion Explained 0.00198 0.00179 0.00126 0.00111 0.000719 0.000392 0.00014
Cumulative Proportion 1.09767 1.09946 1.10073 1.10184 1.102554 1.102945 1.10309
iMDS1 iMDS2 iMDS3 iMDS4 iMDS5 iMDS6
Eigenvalue -0.003869 -0.00544 -0.011558 -0.01721 -0.02214 -0.02548
Proportion Explained -0.000263 -0.00037 -0.000786 -0.00117 -0.00151 -0.00173
Cumulative Proportion 1.102822 1.10245 1.101665 1.10049 1.09899 1.09725
iMDS7 iMDS8 iMDS9 iMDS10 iMDS11 iMDS12
Eigenvalue -0.02838 -0.03452 -0.04007 -0.04078 -0.04661 -0.05027
Proportion Explained -0.00193 -0.00235 -0.00273 -0.00278 -0.00317 -0.00342
Cumulative Proportion 1.09532 1.09297 1.09025 1.08747 1.08430 1.08088
iMDS13 iMDS14 iMDS15 iMDS16 iMDS17 iMDS18
Eigenvalue -0.05255 -0.05617 -0.0617 -0.06241 -0.06648 -0.07303
Proportion Explained -0.00358 -0.00382 -0.0042 -0.00425 -0.00452 -0.00497
Cumulative Proportion 1.07730 1.07348 1.0693 1.06504 1.06051 1.05555
iMDS19 iMDS20 iMDS21 iMDS22 iMDS23 iMDS24
Eigenvalue -0.08062 -0.08916 -0.10482 -0.11435 -0.13417 -0.293
Proportion Explained -0.00549 -0.00607 -0.00713 -0.00778 -0.00913 -0.020
Cumulative Proportion 1.05006 1.04399 1.03686 1.02908 1.01995 1.000
Accumulated constrained eigenvalues
Importance of components:
dbRDA1 dbRDA2 dbRDA3 dbRDA4 dbRDA5 dbRDA6 dbRDA7 dbRDA8
Eigenvalue 4.60 0.916 0.6740 0.401 0.2560 0.245 0.1516 0.1027
Proportion Explained 0.62 0.123 0.0908 0.054 0.0345 0.033 0.0204 0.0138
Cumulative Proportion 0.62 0.743 0.8342 0.888 0.9226 0.956 0.9761 0.9899
dbRDA9 dbRDA10 dbRDA11
Eigenvalue 0.05735 0.01640 0.001218
Proportion Explained 0.00773 0.00221 0.000164
Cumulative Proportion 0.99763 0.99984 1.000000
# plot(mite.cap)
# Hierarchical partitioning of the db-RDA (input is the distance matrix)
mite.cap.hp <- rdacca.hp(mite.bray, mite.env, method = 'dbRDA', type = 'adjR2', scale = FALSE)
mite.cap.hp
$Method_Type
[1] "dbRDA" "adjR2"
$Total_explained_variation
[1] 0.356
$Hier.part
Unique Average.share Individual I.perc(%)
SubsDens 0.0296 -0.0055 0.0241 6.77
WatrCont 0.0749 0.0464 0.1213 34.07
Substrate 0.0474 0.0059 0.0533 14.97
Shrub 0.0315 0.0538 0.0853 23.96
Topo 0.0420 0.0305 0.0725 20.37
attr(,"class")
[1] "rdaccahp"
# Bar plot of each variable's individual contribution (db-RDA)
plot(mite.cap.hp)
# Inspect the hierarchical partitioning table before saving
mite.cap.hp$Hier.part
Unique Average.share Individual I.perc(%)
SubsDens 0.0296 -0.0055 0.0241 6.77
WatrCont 0.0749 0.0464 0.1213 34.07
Substrate 0.0474 0.0059 0.0533 14.97
Shrub 0.0315 0.0538 0.0853 23.96
Topo 0.0420 0.0305 0.0725 20.37
# Save the db-RDA hierarchical partitioning table
write.csv(mite.cap.hp$Hier.part, 'results/Other_microbiome_analysis/mite.cap.hp.csv')
# Significance test of variable contribution in hierarchical partitioning.
# Using the RDA result as the example: each environmental factor's individual
# explained variation is tested with 999 permutations (seed fixed).
set.seed(123)
permu_hp <- permu.hp(dv = mite.hel, iv = mite.env, method = 'RDA', type = 'adjR2', permutations = 999)
Please wait: running 998 permutations
permu_hp
Individual Pr(>I)
SubsDens 0.0320 0.018 *
WatrCont 0.1642 0.001 ***
Substrate 0.0543 0.017 *
Shrub 0.1018 0.001 ***
Topo 0.0843 0.001 ***
# Save the permutation-test table
write.csv(permu_hp, 'results/Other_microbiome_analysis/permu_hp.csv')
# Plot: individual contribution per variable, annotated with significance stars
permu_hp$Variables <- rownames(permu_hp)
# 'Pr(>I)' holds strings like "0.001 ***"; keep only the star token after the
# space. vapply() replaces unlist(lapply(...)) and guarantees one character
# value per element (NA when there is no star).
permu_hp$p <- vapply(as.character(permu_hp$'Pr(>I)'),
                     function(x) strsplit(x, ' ')[[1]][2],
                     character(1), USE.NAMES = FALSE)
library(ggplot2)
ggplot(permu_hp, aes(Variables, Individual)) +
geom_col() +
geom_text(aes(label = p), vjust = -0.3)
# Load packages
library(BiocManager)
# BiocManager::install("microbiome")
# BiocManager::install("DirichletMultinomial")
# BiocManager::install("reshape2")
# BiocManager::install("magrittr",force = TRUE)
library(dplyr)
library(microbiome)
library(DirichletMultinomial)
library(reshape2)
library(magrittr)
# Load data
data("dietswap")
# Contains 52 microorganisms, 222 samples, and the sample information file has 8 columns
pseq<-dietswap
# Restrict the analysis to the core microbiota (relative abundances first)
pseq.comp<-microbiome::transform(pseq,"compositional")
# Core definition: >= 0.1% relative abundance in >= 50% of the samples
taxa<-core_members(pseq.comp,detection = 0.1/100,prevalence = 50/100)
pseq<-prune_taxa(taxa,pseq) # Keep only the core taxa in the phyloseq object
# Inspect sample metadata; 'group' carries the three diet groups used below
map = sample_data(pseq)
head(map)
subject sex nationality group sample timepoint
Sample-1 byn male AAM DI Sample-1 4
Sample-2 nms male AFR HE Sample-2 2
Sample-3 olt male AFR HE Sample-3 2
Sample-4 pku female AFR HE Sample-4 2
Sample-5 qjy female AFR HE Sample-5 2
Sample-6 riv female AFR HE Sample-6 2
timepoint.within.group bmi_group
Sample-1 1 obese
Sample-2 1 lean
Sample-3 1 overweight
Sample-4 1 obese
Sample-5 1 overweight
Sample-6 1 obese
# Inspect the distinct group labels (printout below shows DI / HE / ED)
map$group %>% unique()
[1] DI HE ED
Levels: DI ED HE
# Abundance matrix of the core taxa: rows = taxa, columns = the 222 samples
dat<-microbiome::abundances(pseq)
# Transpose to samples x taxa and coerce to the integer count matrix dmn() expects
count<-as.matrix(t(dat))
# Fit Dirichlet-multinomial mixture models with k = 1..4 components
fit<-lapply(1:4,dmn,count=count,verbose=TRUE)
Soft kmeans
Expectation Maximization setup
Expectation Maximization
Hessian
Soft kmeans
iteration 10 change 0.000080
Expectation Maximization setup
Expectation Maximization
iteration 10 change 0.000000
Hessian
Soft kmeans
iteration 10 change 0.000418
Expectation Maximization setup
Expectation Maximization
iteration 10 change 0.000063
Hessian
Soft kmeans
iteration 10 change 0.000259
iteration 20 change 0.000004
Expectation Maximization setup
Expectation Maximization
iteration 10 change 0.414533
iteration 20 change 0.014936
Hessian
# Compare goodness of fit across the four candidate models
lplc<-sapply(fit,laplace) # Laplace approximation of model evidence, one per k
aic<-sapply(fit,AIC)
# For maximum-likelihood fits, smaller AIC/BIC indicates a better model
bic<-sapply(fit,BIC)
# Select best model
# Pick the fit with the smallest Laplace value
best<-fit[[which.min(unlist(lplc))]]
# Mixture weights (pi) and dispersion (theta) of the best-fitting model
mixturewt(best)
pi theta
1 0.324 80.2
2 0.265 119.5
3 0.234 71.4
4 0.177 338.0
# Save each sample's hard cluster assignment (component with max posterior)
ass <- apply(mixture(best), 1, which.max)
write.csv(ass, file="results/Other_microbiome_analysis/DMM_3clusters_L6.csv")
# The fitted taxon-by-cluster contribution matrix is loop-invariant:
# melt it once instead of re-melting on every iteration.
contrib <- melt(fitted(best))
colnames(contrib) <- c("OTU","cluster","value")
for (k in seq_len(ncol(fitted(best)))) {
  d <- subset(contrib, cluster == k) %>%
    # Arrange OTUs by assignment strength
    arrange(value) %>%
    mutate(OTU = factor(OTU, levels = unique(OTU))) %>%
    # Keep only the strongest 20% of drivers for this community type
    filter(abs(value) > quantile(abs(value), 0.8))
  p <- ggplot(d, aes(x = OTU, y = value)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    labs(title = paste("Top drivers: community type", k))
  print(p)
}
# Inspect the drivers retained for the last community type in the loop above
d
OTU cluster value
1 Sporobacter termitidis et rel. 4 2.42
2 Subdoligranulum variable at rel. 4 2.57
3 Ruminococcus obeum et rel. 4 2.59
4 Butyrivibrio crossotus et rel. 4 2.77
5 Clostridium symbiosum et rel. 4 3.10
6 Clostridium cellulosi et rel. 4 3.29
7 Bacteroides vulgatus et rel. 4 4.17
8 Oscillospira guillermondii et rel. 4 9.03
9 Faecalibacterium prausnitzii et rel. 4 9.83
10 Prevotella oralis et rel. 4 32.80
11 Prevotella melaninogenica et rel. 4 224.42
# How to choose the best k value
data(fit) # NOTE(review): loads the package's bundled example fits, overwriting the 'fit' computed above
lplc <- sapply(fit, laplace)
#plot(lplc, type="b")
fit[[which.min(lplc)]] # best model = minimum Laplace approximation
class: DMN
k: 4
samples x taxa: 278 x 130
Laplace: 38781 BIC: 40425 AIC: 39477
# Scree-type plot: Laplace approximation versus number of DMM clusters.
# The dashed vertical line marks the chosen model (minimum of the curve).
lplc2 <- as.data.frame(lplc)
lplc2$cluster <- rownames(lplc2)
# Transparent-background styling, extracted for readability
scree_theme <- theme(panel.grid.major = element_line(colour = NA),
                     panel.background = element_rect(fill = "transparent", colour = NA),
                     plot.background = element_rect(fill = "transparent", colour = NA),
                     plot.title = element_text(hjust = 0.5, size = 15),
                     panel.grid.minor = element_blank(),
                     text = element_text(family = "sans"),
                     axis.text.x = element_text(hjust = 0.5, size = 10),
                     axis.text.y = element_text(hjust = 0.5, size = 10),
                     axis.title.y = element_text(size = 15),
                     axis.title.x = element_text(size = 15),
                     legend.text = element_text(size = 15),
                     legend.position = c(.92, .72),
                     legend.box.background = element_rect(color = "black"))
p01_DMM <- ggplot(data = lplc2, aes(x = cluster, y = lplc, group = 1)) +
  geom_point() +
  geom_line(color = "lightblue") +
  labs(x = "Number of clusters k", y = "Laplace approximation") +
  theme_classic() +
  geom_vline(xintercept = 4, colour = 'black', lwd = 0.36, linetype = "dashed") +
  scree_theme
p01_DMM
# load data
design<-read.csv("data/test_design.csv",row.names = 1)
otu<-read.csv("data/test_otu.csv",row.names = 1)
# load packages
library(linkET)
library(ggplot2)
library(dplyr)
library(vegan)
library(reshape2)
library(RColorBrewer)
library(dplyr)
# Community table for beta-diversity: first 200 OTUs x 24 samples, transposed to samples x OTUs
Bacteria<-as.data.frame(t(otu[1:200,1:24]))
# Mock environmental-factor table (rows chosen arbitrarily for the demo; substitute real measurements)
env_beta<-otu[840:863,7:19]
colnames(env_beta)<-c("SOC","TN","AP","TP","TK","AK","NO3","Lat","Lon","MAT",
"MAP","pH","Fe")
# Arbitrary offsets purely so the demo visualization looks reasonable
env_beta[1:12,1:8]=env_beta[1:12,1:8]+10
env_beta[1:12,9:13]=env_beta[1:12,9:13]-8
# Mantel test of the community against each environmental factor, then bin
# r and p into the discrete classes used for line width/colour in the plot
mantel <- mantel_test(Bacteria, env_beta,spec_select = list(Bacteria=1:200)) %>%
mutate(rd = cut(r, breaks = c(-Inf, 0.2, 0.4, Inf),labels = c("< 0.2", "0.2 - 0.4", ">= 0.4")),
pd = cut(p, breaks = c(-Inf, 0.01, 0.05, Inf),labels = c("< 0.01", "0.01 - 0.05", ">= 0.05")))
# Upper-triangle correlation heatmap of the environmental factors with Mantel
# links overlaid (couple colour = p class, couple width = r class)
p1=qcorrplot(correlate(env_beta),type = "upper",diag = F)+
geom_square()+
geom_couple(aes(colour = pd,size = rd),data = mantel,curvature = nice_curvature())+
scale_size_manual(values = c(1.5,3,4.5))+
scale_fill_gradientn(colours = RColorBrewer::brewer.pal(11, "RdBu"))+
scale_colour_manual(values = c("#8B30B5","#5AB328","gray88"))+
guides(size = guide_legend(title = "Mantel's r",override.aes = list(colour = "grey35"),order = 2),
colour = guide_legend(title = "Mantel's p", override.aes = list(size = 3),order = 1),
fill = guide_colorbar(title = "Pearson's r", order = 3))
# Demo heat map: take a 14 x 10 slice of the table and bin large values purely for display
pp<-otu[1011:1024,1:10]
pp[pp>=8]=100
pp[pp>=7&pp<=8]=50
# These values stand in for correlation coefficients; in real use compute
# Spearman/Pearson r (and its p-value) yourself and plot those instead
rownames(pp)<-c(paste("Bacteria",1:14))
colnames(pp)<-c(paste("Soil",1:10))
# Long format for ggplot
io<-pp%>% mutate(pp=row.names(.))%>%melt()
df<-cbind(io,io$value)
# The duplicated fourth column acts as a placeholder significance marker; in
# real use fill it with stars derived from the correlation p-values
colnames(df)<-c("pp","variable","value","sig")
# Heat map: white tiles with coloured squares on top; red rules frame the panel
p2=ggplot(df,aes(variable,pp,fill=value))+
geom_tile(color="gray42",fill="white",size=0.7)+
geom_point(pch=22,color="white",size=10.8)+
geom_vline(aes(xintercept =0.5), size=1.3, colour="red")+
geom_vline(aes(xintercept =10.5), size=1.3, colour="red")+
geom_hline(aes(yintercept =0.5), size=1.3, colour="red")+
geom_hline(aes(yintercept =14.5), size=1.3, colour="red")+
theme(panel.background = element_blank(),legend.position = "none")+
theme(axis.text=element_text(colour='black',size=9))+
scale_fill_gradientn(colours =colorRampPalette(c('#CCCCCC','#339966','#FF9900'))(99))+
labs(x = NULL,y = NULL,fill="")
# Combine the Mantel plot and the heat map side by side
library(cowplot)
p3 <- cowplot::plot_grid(p1, p2 ,ncol= 2, rel_widths = c(1.5, 1))
p3
ggsave("results/Other_microbiome_analysis/Chart_combination01.pdf", p3, width = 13, height = 6)
# Correlate two environmental/abundance tables and draw heatmap + bubble plots.
#
# env1, env2     : data frames sharing row names (samples); env2 is reordered
#                  to match env1 unless it has a single column.
# label          : add the row_cluster dendrogram on top of the plots.
# col_cluster    : add a dendrogram of env1 variables on the left.
# row_cluster    : build a dendrogram of env2 variables (needed for `label`).
# method         : correlation method passed to psych::corr.test().
# r.threshold,
# p.threshold    : correlations that are both non-significant AND weak are zeroed.
# theme.size     : x-axis text size.
#
# Returns list(p1, p2): p1 = tile heatmap, p2 = bubble plot, both star-annotated.
cor_env_ggcorplot <- function(
env1 = env1,
env2 = env2,
label = TRUE,
col_cluster = TRUE,
row_cluster = TRUE,
method = "spearman",
r.threshold = 0.6,
p.threshold = 0.05,
theme.size = 10
){
  # Align env2 rows to env1's sample order (skip for a single column, where
  # `[ , ]` subsetting would drop the data frame structure)
  if (dim(env2)[2] != 1) {
    env2 <- env2[match(row.names(env1), row.names(env2)), ]
  }
  env0 <- cbind(env1, env2)
  # Pairwise correlations with FDR-adjusted p-values
  occor <- psych::corr.test(env0, use = "pairwise", method = method,
                            adjust = "fdr", alpha = .05)
  occor.r <- occor$r
  occor.p <- occor$p
  # Zero out correlations that fail BOTH filters (non-significant and weak).
  # NOTE(review): if the intent was "non-significant OR weak", this should be
  # '|' -- confirm with the authors before changing; callers pass 0/0 here.
  occor.r[occor.p > p.threshold & abs(occor.r) < r.threshold] <- 0
  # Optional dendrograms for the two plot margins
  if (col_cluster) {
    clust <- hclust(dist(env1 %>% as.matrix() %>% t()))
    ggtree_plot <- ggtree::ggtree(clust)
  }
  if (row_cluster) {
    v_clust <- hclust(dist(env2 %>% as.matrix() %>% t()))
    ggtree_plot_col <- ggtree::ggtree(v_clust) + ggtree::layout_dendrogram()
  }
  # Long-format r values (env1 variables x env2 variables)
  occor.r <- as.data.frame(occor.r)
  if (dim(env2)[2] == 1) {
    data <- occor.r[colnames(env1), colnames(env2)]
    data <- data.frame(row.names = colnames(env1), data)
    colnames(data) <- colnames(env2)
    data$id <- row.names(data)
  } else {
    data <- occor.r[colnames(env1), colnames(env2)]
    data$id <- row.names(data)
  }
  pcm <- reshape2::melt(data, id = c("id"))
  # Long-format p values, same shape as pcm
  occor.p <- as.data.frame(occor.p)
  if (dim(env2)[2] == 1) {
    data <- occor.p[colnames(env1), colnames(env2)]
    data <- data.frame(row.names = colnames(env1), data)
    colnames(data) <- colnames(env2)
    data$id <- row.names(data)
  } else {
    data <- occor.p[colnames(env1), colnames(env2)]
    data$id <- row.names(data)
  }
  pcm2 <- reshape2::melt(data, id = c("id"))
  colnames(pcm2)[3] <- "p"
  # Significance stars from the numeric p-values. The original sequential
  # replacement compared the column AFTER it had been coerced to character
  # ('"**" < 0.05' is a string comparison), so p < 0.001 cells ended up "*";
  # cut() on the numeric vector fixes that. Intervals are [lo, hi).
  pcm2$lab <- as.character(cut(pcm2$p,
                               breaks = c(-Inf, 0.001, 0.05, Inf),
                               labels = c("**", "*", ""),
                               right = FALSE))
  # Merge r and star labels; explicit keys avoid the by-message and surprises
  pcm3 <- pcm %>% left_join(pcm2, by = c("id", "variable"))
  # p1: tile heatmap
  p1 <- ggplot(pcm3, aes(y = id, x = variable)) +
    geom_tile(aes(size = value, fill = value)) +
    scale_size_continuous(limits = c(0.000001, 100), range = c(2, 25), breaks = c(0.1, 0.5, 1)) +
    geom_text(aes(label = lab)) +
    labs(y = "", x = "", size = "Relative Abundance (%)", fill = "") +
    scale_x_discrete(limits = rev(levels(pcm$variable))) +
    scale_y_discrete(position = "right") +
    scale_fill_gradientn(colours = colorRampPalette(c("#377EB8", "#F7F4F9", "#E41A1C"))(60)) +
    theme(
      panel.background = element_blank(),
      panel.grid = element_blank(),
      axis.text.x = element_text(colour = "black", size = theme.size, angle = 60, vjust = 1, hjust = 1)
    )
  # p2: bubble plot of the same matrix
  p2 <- ggplot(pcm3, aes(y = id, x = variable)) +
    geom_point(aes(size = value, fill = value), alpha = 0.75, shape = 21) +
    scale_size_continuous(limits = c(0.000001, 100), range = c(2, 25), breaks = c(0.1, 0.5, 1)) +
    geom_text(aes(label = lab)) +
    labs(y = "", x = "", size = "Relative Abundance (%)", fill = "") +
    scale_x_discrete(limits = rev(levels(pcm$variable))) +
    scale_y_discrete(position = "right") +
    scale_fill_gradientn(colours = colorRampPalette(c("#377EB8", "#F7F4F9", "#E41A1C"))(60)) +
    theme(
      panel.background = element_blank(),
      panel.grid = element_blank(),
      axis.text.x = element_text(colour = "black", size = theme.size, angle = 60, vjust = 1, hjust = 1)
    )
  if (col_cluster) {
    p1 <- p1 %>% aplot::insert_left(ggtree_plot, width = .2)
    p2 <- p2 %>% aplot::insert_left(ggtree_plot, width = .2)
  }
  # The original errored when label = TRUE but row_cluster = FALSE
  # (ggtree_plot_col undefined); guard on both flags.
  if (label && row_cluster) {
    p1 <- p1 %>% aplot::insert_top(ggtree_plot_col, height = .1)
    p2 <- p2 %>% aplot::insert_top(ggtree_plot_col, height = .1)
  }
  return(list(p1, p2))
}
# Load data
metadata = read.delim("./data/data.ps/metadata.tsv")
row.names(metadata) = metadata$SampleID
otutab = read.table("./data/data.ps/otutab.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
taxonomy = read.table("./data/data.ps/taxonomy.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Convert counts to per-sample relative abundances (each column sums to 1)
otutab2 = apply(otutab, 2, function(x) x/sum(x))
otutab2 <- as.data.frame(otutab2)
otutab2$OTU <- rownames(otutab2)
taxonomy <- as.data.frame(taxonomy)
taxonomy$OTU <- rownames(taxonomy)
# Join taxonomy onto abundances by OTU id
otutab3 = merge(taxonomy, otutab2, by = "OTU")
rownames(otutab3) <- otutab3$OTU
# Column 3 = Phylum, columns 9:26 = sample columns -- assumes this layout; confirm against the input files
otutab_phylum <- otutab3[, c(3, 9:26)]
# sum of phylum
otutab_phylum <- aggregate(.~ Phylum, data = otutab_phylum, sum)
rownames(otutab_phylum) = otutab_phylum$Phylum
otutab_phylum = otutab_phylum[, -1]
#otu = phyloseq::otu_table(psdata)
otu = otutab_phylum
#tax = phyloseq::tax_table(psdata)
Top = 10
# Keep the Top most abundant phyla -- or all of them when fewer exist.
# min() collapses the duplicated if/else branches of the original.
n_keep <- min(Top, dim(otu)[1])
top10 <- otu[names(sort(rowSums(otu), decreasing = TRUE)[1:n_keep]), ]
top10 <- t(top10) # transpose to samples x phyla
head(top10)
Proteobacteria Actinobacteria Bacteroidetes Firmicutes Chloroflexi
KO1 0.659 0.255 0.0294 0.01634 0.01640
KO2 0.500 0.405 0.0262 0.02002 0.02142
KO3 0.608 0.281 0.0746 0.01508 0.00623
KO4 0.621 0.282 0.0305 0.03232 0.01819
KO5 0.738 0.172 0.0509 0.00733 0.01439
KO6 0.694 0.236 0.0233 0.01572 0.01108
Unassigned Acidobacteria Verrucomicrobia Planctomycetes Spirochaetes
KO1 0.01539 0.00369 0.002012 0.000671 0.000701
KO2 0.01678 0.00350 0.002070 0.002461 0.000447
KO3 0.00659 0.00244 0.001480 0.001739 0.001168
KO4 0.01079 0.00191 0.001008 0.000398 0.000928
KO5 0.01103 0.00179 0.000984 0.000725 0.001450
KO6 0.01110 0.00366 0.001942 0.000902 0.000684
# Environmental / plant-trait table: one row per sample, first column = sample ID
env = read.csv("./data/dataNEW/env.csv")
head(env)
ID pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR
1 sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863
2 sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976
3 sample3 4.47 7.58 0.92 6.27 11.85 50.8 68.9 8.24 44.6 48 0.886 1.340
4 sample4 4.63 8.66 0.65 5.49 10.22 55.7 58.7 13.28 39.7 53 0.644 0.801
5 sample5 4.38 9.59 0.74 6.55 9.84 44.4 60.8 13.04 50.7 55 0.591 0.665
6 sample6 4.52 9.88 0.63 5.66 9.73 39.9 70.4 15.74 42.5 55 0.751 1.145
LB SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5 30 47 56 34.5 145 84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5 26 41 110 26.4 108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9 22 33 84 25.3 153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9 28 51 103 28.6 132 77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4 25 42 131 26.8 220 211.1 0.241
LRD2 LRD3 MaxO TLRN TLRL BI MID
1 0.0655 0.058 4 156 296 1.54 0.619
2 0.0700 0.069 5 133 263 1.80 1.780
3 0.0660 0.066 6 177 316 2.67 1.373
4 0.0940 0.086 5 139 335 1.37 0.811
5 0.0680 0.071 7 182 238 1.88 0.700
6 0.0700 0.068 4 198 457 1.73 0.602
# Work on a copy so 'env' itself stays untouched
envRDA = env
head(env)
ID pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR
1 sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863
2 sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976
3 sample3 4.47 7.58 0.92 6.27 11.85 50.8 68.9 8.24 44.6 48 0.886 1.340
4 sample4 4.63 8.66 0.65 5.49 10.22 55.7 58.7 13.28 39.7 53 0.644 0.801
5 sample5 4.38 9.59 0.74 6.55 9.84 44.4 60.8 13.04 50.7 55 0.591 0.665
6 sample6 4.52 9.88 0.63 5.66 9.73 39.9 70.4 15.74 42.5 55 0.751 1.145
LB SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5 30 47 56 34.5 145 84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5 26 41 110 26.4 108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9 22 33 84 25.3 153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9 28 51 103 28.6 132 77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4 25 42 131 26.8 220 211.1 0.241
LRD2 LRD3 MaxO TLRN TLRL BI MID
1 0.0655 0.058 4 156 296 1.54 0.619
2 0.0700 0.069 5 133 263 1.80 1.780
3 0.0660 0.066 6 177 316 2.67 1.373
4 0.0940 0.086 5 139 335 1.37 0.811
5 0.0680 0.071 7 182 238 1.88 0.700
6 0.0700 0.068 4 198 457 1.73 0.602
# Use the sample IDs as row names, then drop the ID column
row.names(envRDA) = env$ID
envRDA$ID = NULL
head(envRDA)
pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR LB
sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863 48.4
sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976 70.1
sample3 4.47 7.58 0.92 6.27 11.85 50.8 68.9 8.24 44.6 48 0.886 1.340 61.1
sample4 4.63 8.66 0.65 5.49 10.22 55.7 58.7 13.28 39.7 53 0.644 0.801 74.7
sample5 4.38 9.59 0.74 6.55 9.84 44.4 60.8 13.04 50.7 55 0.591 0.665 58.2
sample6 4.52 9.88 0.63 5.66 9.73 39.9 70.4 15.74 42.5 55 0.751 1.145 70.4
SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
sample1 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
sample2 213 380 1.340 93.1 36.1 29.5 27.5 30 47 56 34.5 145 84.2 0.126
sample3 253 351 1.117 89.7 30.0 31.2 28.5 26 41 110 26.4 108 181.3 0.179
sample4 220 309 1.048 96.9 38.5 30.6 27.9 22 33 84 25.3 153 156.6 0.208
sample5 309 395 1.078 89.4 32.6 29.8 26.9 28 51 103 28.6 132 77.3 0.135
sample6 255 356 1.095 92.3 33.6 30.3 28.4 25 42 131 26.8 220 211.1 0.241
LRD2 LRD3 MaxO TLRN TLRL BI MID
sample1 0.0655 0.058 4 156 296 1.54 0.619
sample2 0.0700 0.069 5 133 263 1.80 1.780
sample3 0.0660 0.066 6 177 316 2.67 1.373
sample4 0.0940 0.086 5 139 335 1.37 0.811
sample5 0.0680 0.071 7 182 238 1.88 0.700
sample6 0.0700 0.068 4 198 457 1.73 0.602
env1 = envRDA
env2 = top10
env2 = as.data.frame(env2)
# Give env2 the same row names as env1 -- assumes both tables list samples in
# the same order (positional alignment); TODO confirm
env2$sample <- rownames(env1)
rownames(env2) <- env2$sample
# Drop the helper 'sample' column (column 11) again
env2 <- env2[, -11]
# Thresholds of 0 keep every correlation (no filtering)
result = cor_env_ggcorplot(
env1 = env1,
env2 = env2,
label = TRUE,
col_cluster = TRUE,
row_cluster = TRUE,
method = "spearman",
r.threshold= 0,
p.threshold= 0
)
p1 <- result[[1]] # tile heatmap
p1
p2 <- result[[2]] # bubble plot
p2
# Output dimensions: height scales with the number of env variables
hei = dim(env)[2]/5
wid = Top
# paste() with a single string and sep = "" is a no-op; assign the paths
# directly and reuse the precomputed height.
filename = "results/Other_microbiome_analysis/Top10_Phylum_abundacne_OTU.csv"
write.csv(top10, filename)
filename = "results/Other_microbiome_analysis/ggheatmap.pdf"
ggsave(filename, p1, width = Top/2, height = hei)
filename = "results/Other_microbiome_analysis/ggbubble.pdf"
ggsave(filename, p2, width = Top/2, height = hei)
filename = "results/Other_microbiome_analysis/ggheatmap2.jpg"
ggsave(filename, p1, width = Top/2, height = hei)
filename = "results/Other_microbiome_analysis/ggbubble2.jpg"
ggsave(filename, p2, width = Top/2, height = hei)
# Load data (NOTE: repeats the identical preparation done earlier in the script)
metadata = read.delim("./data/data.ps/metadata.tsv")
row.names(metadata) = metadata$SampleID
otutab = read.table("./data/data.ps/otutab.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
taxonomy = read.table("./data/data.ps/taxonomy.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
# Counts -> per-sample relative abundances
otutab2 = apply(otutab, 2, function(x) x/sum(x))
otutab2 <- as.data.frame(otutab2)
otutab2$OTU <- rownames(otutab2)
taxonomy <- as.data.frame(taxonomy)
taxonomy$OTU <- rownames(taxonomy)
otutab3 = merge(taxonomy, otutab2, by = "OTU")
rownames(otutab3) <- otutab3$OTU
otutab_phylum <- otutab3[, c(3, 9:26)]
# sum of phylum
otutab_phylum <- aggregate(.~ Phylum, data = otutab_phylum, sum)
rownames(otutab_phylum) = otutab_phylum$Phylum
otutab_phylum = otutab_phylum[, -1]
#otu = phyloseq::otu_table(psdata)
otu = otutab_phylum
#tax = phyloseq::tax_table(psdata)
tran = TRUE
Top = 10
# Row-wise standard deviation of a numeric matrix / data frame.
# The result is named after the rownames of `x`.
rowSD = function(x){
  apply(x, MARGIN = 1, FUN = sd)
}
# Row-wise coefficient of variation: sd / mean for each row.
rowCV = function(x){
  sds <- rowSD(x)
  mus <- rowMeans(x)
  sds / mus
}
# Pick the Top phyla with the highest coefficient of variation across samples.
cv.per.phylum <- rowCV(as.data.frame(otu))
id <- names(head(sort(cv.per.phylum, decreasing = TRUE), Top))
# Transpose so samples are rows and the selected phyla are columns.
data = as.data.frame(t(otu[id, ]))
# Environmental / plant-trait table: one row per sample, first column = ID.
env = read.csv("./data/dataNEW/env.csv")
head(env)
ID pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR
1 sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863
2 sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976
3 sample3 4.47 7.58 0.92 6.27 11.85 50.8 68.9 8.24 44.6 48 0.886 1.340
4 sample4 4.63 8.66 0.65 5.49 10.22 55.7 58.7 13.28 39.7 53 0.644 0.801
5 sample5 4.38 9.59 0.74 6.55 9.84 44.4 60.8 13.04 50.7 55 0.591 0.665
6 sample6 4.52 9.88 0.63 5.66 9.73 39.9 70.4 15.74 42.5 55 0.751 1.145
LB SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5 30 47 56 34.5 145 84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5 26 41 110 26.4 108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9 22 33 84 25.3 153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9 28 51 103 28.6 132 77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4 25 42 131 26.8 220 211.1 0.241
LRD2 LRD3 MaxO TLRN TLRL BI MID
1 0.0655 0.058 4 156 296 1.54 0.619
2 0.0700 0.069 5 133 263 1.80 1.780
3 0.0660 0.066 6 177 316 2.67 1.373
4 0.0940 0.086 5 139 335 1.37 0.811
5 0.0680 0.071 7 182 238 1.88 0.700
6 0.0700 0.068 4 198 457 1.73 0.602
# Working copy of env that gets sample IDs as rownames (for RDA/correlation).
envRDA = env
head(env)
ID pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR
1 sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863
2 sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976
3 sample3 4.47 7.58 0.92 6.27 11.85 50.8 68.9 8.24 44.6 48 0.886 1.340
4 sample4 4.63 8.66 0.65 5.49 10.22 55.7 58.7 13.28 39.7 53 0.644 0.801
5 sample5 4.38 9.59 0.74 6.55 9.84 44.4 60.8 13.04 50.7 55 0.591 0.665
6 sample6 4.52 9.88 0.63 5.66 9.73 39.9 70.4 15.74 42.5 55 0.751 1.145
LB SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5 30 47 56 34.5 145 84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5 26 41 110 26.4 108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9 22 33 84 25.3 153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9 28 51 103 28.6 132 77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4 25 42 131 26.8 220 211.1 0.241
LRD2 LRD3 MaxO TLRN TLRL BI MID
1 0.0655 0.058 4 156 296 1.54 0.619
2 0.0700 0.069 5 133 263 1.80 1.780
3 0.0660 0.066 6 177 316 2.67 1.373
4 0.0940 0.086 5 139 335 1.37 0.811
5 0.0680 0.071 7 182 238 1.88 0.700
6 0.0700 0.068 4 198 457 1.73 0.602
# Use sample IDs as rownames and drop the now-redundant ID column.
row.names(envRDA) = env$ID
envRDA$ID = NULL
head(envRDA)
pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR LB
sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863 48.4
sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976 70.1
sample3 4.47 7.58 0.92 6.27 11.85 50.8 68.9 8.24 44.6 48 0.886 1.340 61.1
sample4 4.63 8.66 0.65 5.49 10.22 55.7 58.7 13.28 39.7 53 0.644 0.801 74.7
sample5 4.38 9.59 0.74 6.55 9.84 44.4 60.8 13.04 50.7 55 0.591 0.665 58.2
sample6 4.52 9.88 0.63 5.66 9.73 39.9 70.4 15.74 42.5 55 0.751 1.145 70.4
SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
sample1 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
sample2 213 380 1.340 93.1 36.1 29.5 27.5 30 47 56 34.5 145 84.2 0.126
sample3 253 351 1.117 89.7 30.0 31.2 28.5 26 41 110 26.4 108 181.3 0.179
sample4 220 309 1.048 96.9 38.5 30.6 27.9 22 33 84 25.3 153 156.6 0.208
sample5 309 395 1.078 89.4 32.6 29.8 26.9 28 51 103 28.6 132 77.3 0.135
sample6 255 356 1.095 92.3 33.6 30.3 28.4 25 42 131 26.8 220 211.1 0.241
LRD2 LRD3 MaxO TLRN TLRL BI MID
sample1 0.0655 0.058 4 156 296 1.54 0.619
sample2 0.0700 0.069 5 133 263 1.80 1.780
sample3 0.0660 0.066 6 177 316 2.67 1.373
sample4 0.0940 0.086 5 139 335 1.37 0.811
sample5 0.0680 0.071 7 182 238 1.88 0.700
sample6 0.0700 0.068 4 198 457 1.73 0.602
# Correlate environmental variables with the CV-selected phyla (data).
env1 = envRDA
env2 = data
env2 = as.data.frame(env2)
# Assumes env and abundance tables share sample order -- TODO confirm.
env2$sample <- rownames(env1)
rownames(env2) <- env2$sample
# Drop the helper column by NAME rather than hard-coded position 11,
# which silently breaks whenever Top != 10.
env2$sample <- NULL
result = cor_env_ggcorplot(
  env1 = env1,
  env2 = env2,
  label = TRUE,
  col_cluster = TRUE,
  row_cluster = TRUE,
  method = "spearman",
  r.threshold= 0,
  p.threshold= 0
)
# Extract and export the correlation heatmap (p1) and bubble plot (p2).
p1 <- result[[1]]
p1
p2 <- result[[2]]
p2
# Figure dimensions (inches); wid now holds the value actually used
# (the original assigned wid = Top but plotted with Top/2 inline).
hei = dim(env)[2]/5
wid = Top/2
# paste0() replaces the redundant paste(..., sep = "").
filename = paste0("results/Other_microbiome_analysis/Top10_Phylum_abundacne_OTU2.csv")
write.csv(data, filename)
filename = paste0("results/Other_microbiome_analysis/ggheatmap3.pdf")
ggsave(filename, p1, width = wid, height = hei)
filename = paste0("results/Other_microbiome_analysis/ggbubble3.pdf")
ggsave(filename, p2, width = wid, height = hei)
filename = paste0("results/Other_microbiome_analysis/ggheatmap4.jpg")
ggsave(filename, p1, width = wid, height = hei)
filename = paste0("results/Other_microbiome_analysis/ggbubble4.jpg")
ggsave(filename, p2, width = wid, height = hei)
library(phyloseq)
library(ggClusterNet)
library(tidyverse)
# Prefer ggClusterNet's cor() over other attached packages' versions.
# NOTE(review): conflicts_prefer() is from the 'conflicted' package --
# assumed attached earlier; confirm.
conflicts_prefer(ggClusterNet::cor)
# Environmental / plant-trait table, one row per sample.
env = read.csv("./data/dataNEW/env.csv")
head(env)
ID pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR
1 sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863
2 sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976
3 sample3 4.47 7.58 0.92 6.27 11.85 50.8 68.9 8.24 44.6 48 0.886 1.340
4 sample4 4.63 8.66 0.65 5.49 10.22 55.7 58.7 13.28 39.7 53 0.644 0.801
5 sample5 4.38 9.59 0.74 6.55 9.84 44.4 60.8 13.04 50.7 55 0.591 0.665
6 sample6 4.52 9.88 0.63 5.66 9.73 39.9 70.4 15.74 42.5 55 0.751 1.145
LB SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5 30 47 56 34.5 145 84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5 26 41 110 26.4 108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9 22 33 84 25.3 153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9 28 51 103 28.6 132 77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4 25 42 131 26.8 220 211.1 0.241
LRD2 LRD3 MaxO TLRN TLRL BI MID
1 0.0655 0.058 4 156 296 1.54 0.619
2 0.0700 0.069 5 133 263 1.80 1.780
3 0.0660 0.066 6 177 316 2.67 1.373
4 0.0940 0.086 5 139 335 1.37 0.811
5 0.0680 0.071 7 182 238 1.88 0.700
6 0.0700 0.068 4 198 457 1.73 0.602
# Copy of env to be re-indexed by sample ID below.
envRDA = env
head(env)
ID pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR
1 sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863
2 sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976
3 sample3 4.47 7.58 0.92 6.27 11.85 50.8 68.9 8.24 44.6 48 0.886 1.340
4 sample4 4.63 8.66 0.65 5.49 10.22 55.7 58.7 13.28 39.7 53 0.644 0.801
5 sample5 4.38 9.59 0.74 6.55 9.84 44.4 60.8 13.04 50.7 55 0.591 0.665
6 sample6 4.52 9.88 0.63 5.66 9.73 39.9 70.4 15.74 42.5 55 0.751 1.145
LB SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
1 48.4 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
2 70.1 213 380 1.340 93.1 36.1 29.5 27.5 30 47 56 34.5 145 84.2 0.126
3 61.1 253 351 1.117 89.7 30.0 31.2 28.5 26 41 110 26.4 108 181.3 0.179
4 74.7 220 309 1.048 96.9 38.5 30.6 27.9 22 33 84 25.3 153 156.6 0.208
5 58.2 309 395 1.078 89.4 32.6 29.8 26.9 28 51 103 28.6 132 77.3 0.135
6 70.4 255 356 1.095 92.3 33.6 30.3 28.4 25 42 131 26.8 220 211.1 0.241
LRD2 LRD3 MaxO TLRN TLRL BI MID
1 0.0655 0.058 4 156 296 1.54 0.619
2 0.0700 0.069 5 133 263 1.80 1.780
3 0.0660 0.066 6 177 316 2.67 1.373
4 0.0940 0.086 5 139 335 1.37 0.811
5 0.0680 0.071 7 182 238 1.88 0.700
6 0.0700 0.068 4 198 457 1.73 0.602
# Sample IDs become rownames; drop the ID column.
row.names(envRDA) = env$ID
envRDA$ID = NULL
head(envRDA)
pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR LB
sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863 48.4
sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976 70.1
sample3 4.47 7.58 0.92 6.27 11.85 50.8 68.9 8.24 44.6 48 0.886 1.340 61.1
sample4 4.63 8.66 0.65 5.49 10.22 55.7 58.7 13.28 39.7 53 0.644 0.801 74.7
sample5 4.38 9.59 0.74 6.55 9.84 44.4 60.8 13.04 50.7 55 0.591 0.665 58.2
sample6 4.52 9.88 0.63 5.66 9.73 39.9 70.4 15.74 42.5 55 0.751 1.145 70.4
SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
sample1 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
sample2 213 380 1.340 93.1 36.1 29.5 27.5 30 47 56 34.5 145 84.2 0.126
sample3 253 351 1.117 89.7 30.0 31.2 28.5 26 41 110 26.4 108 181.3 0.179
sample4 220 309 1.048 96.9 38.5 30.6 27.9 22 33 84 25.3 153 156.6 0.208
sample5 309 395 1.078 89.4 32.6 29.8 26.9 28 51 103 28.6 132 77.3 0.135
sample6 255 356 1.095 92.3 33.6 30.3 28.4 25 42 131 26.8 220 211.1 0.241
LRD2 LRD3 MaxO TLRN TLRL BI MID
sample1 0.0655 0.058 4 156 296 1.54 0.619
sample2 0.0700 0.069 5 133 263 1.80 1.780
sample3 0.0660 0.066 6 177 316 2.67 1.373
sample4 0.0940 0.086 5 139 335 1.37 0.811
sample5 0.0680 0.071 7 182 238 1.88 0.700
sample6 0.0700 0.068 4 198 457 1.73 0.602
ps = readRDS("./data/dataNEW/ps_16s.rds")
# Unique group labels from the phyloseq sample metadata.
id = sample_data(ps)$Group %>% unique()
id
# NOTE(review): the next line is pasted console output; it will error if source()d.
[1] "Group1" "Group2" "Group3"
# Loop index preset for interactive stepping through the loop below.
i = 1
res1path <- "results/Other_microbiome_analysis/"
# Per-group workflow: build a co-occurrence network, detect modules,
# compute module eigengenes (WGCNA) and save them.
for (i in 1:length(id)) {
netpath = paste(res1path,"network_env_hub_",id[i],"/",sep = "")
# NOTE(review): dir.create() warns if the directory exists; consider showWarnings = FALSE.
dir.create(netpath)
# ps.1 = phyloseq::subset_samples(
# ps,Group %in% c(id[i])
# )
# TMM-normalize, then keep only the samples of the current group.
ps.1 = ps %>% scale_micro("TMM") %>%
subset_samples(
Group %in% c(id[i])
)
library(ggClusterNet)
library(igraph)
#-- Compute the microbial-network correlation matrix
result= ggClusterNet::cor_Big_micro(ps = ps.1,
N = 500,
p.threshold = 0.05,
r.threshold = 0.8,
scale = FALSE)
cor = result[[1]]
#-- Fit network modules
tem <- model_maptree(cor =result[[1]],
method = "cluster_fast_greedy",
seed = 12
)
node_model = tem[[2]]
head(node_model)
# Abundance table restricted to OTUs present in the correlation matrix.
# NOTE(review): tax_table(ps) is likely meant to be tax_table(ps.1); it only
# works because ps and ps.1 share taxa rownames -- confirm.
otu = ps.1 %>%
phyloseq::subset_taxa(
row.names(tax_table(ps ))%in%c(row.names(result[[1]]))) %>%
vegan_otu() %>%
as.data.frame()
#- Align module assignments with the OTU column order.
node_model = node_model[match(colnames(otu),node_model$ID),]
MEList = WGCNA::moduleEigengenes(otu, colors = node_model$group)
MEs = MEList$eigengenes
tablename <- paste(netpath,"./model_network_feature_value_",id[i],".csv",sep = "")
write.csv(MEs,tablename)
# Rank network modules by their importance for a single environmental
# variable using a permutation random forest (rfPermute).
#
# model: data frame of module eigengenes (samples in rows; rownames = sample IDs).
# sink:  data frame whose column "ID" holds sample IDs and whose other
#        column (the first after the join) is the response variable.
# seed:  RNG seed for reproducibility of the forest.
# Returns list(importance data frame, ggplot horizontal bar chart).
# NOTE(review): the function name keeps the original (misspelled) "ramdom"
# because callers use it; the self-referential defaults (model = model,
# sink = env) cannot evaluate -- always pass both arguments explicitly.
ramdom_Model_env.plot <- function(
  model = model,
  sink = env,
  seed = 1
){
  model$ID = row.names(model)
  set.seed(seed)
  # Join eigengenes onto the env table by sample ID, then drop the ID column.
  tem.r = sink %>% inner_join(model, by = "ID") %>%
    select(-ID)
  # Response = first column (the env variable); predictors = eigengenes.
  frichness.rfP <- rfPermute::rfPermute(tem.r[[1]] ~., data = tem.r[,-1],
    ntree = 500, na.action = na.omit, nrep = 100, num.cores = 1)
  frimp.scaled1 <- rfPermute::importance(frichness.rfP, scale = TRUE) %>% round(3)
  # Keep %IncMSE and its permutation p-value.
  frimp.scaled1 <- frimp.scaled1[, 1:2]
  df <- cbind(as.data.frame(frimp.scaled1), rownames(frimp.scaled1))
  head(df)
  # Missing p-values are treated as non-significant.
  df$`%IncMSE.pval`[is.na(df$`%IncMSE.pval`)] = 1
  # Significance star, vectorized with ifelse() (replaces the original
  # element-wise loop that grew a vector one entry at a time).
  df$lab = ifelse(df$`%IncMSE.pval` > 0.05, "", "*")
  df$`%IncMSE`[is.na(df$`%IncMSE`)] = 0
  # Horizontal bar chart, modules ordered by importance.
  p <- ggplot(df, aes(x = `%IncMSE`, y = reorder(`rownames(frimp.scaled1)`, `%IncMSE`))) +
    geom_bar(stat = "identity", width = 0.75, position = "dodge", colour = "black", fill = "#9ACD32", alpha = 1) +
    geom_text(aes(label = lab), hjust = -1) +
    labs(y = "Model in network", x = "%IncMSE", title = colnames(sink)[2], size = 9) +
    theme_bw() +
    theme(axis.text = element_text(colour = 'black', size = 9))
  return(list(df, p))
}
# Random-forest importance of each module eigengene for the 2nd env column.
env.1 = env %>% filter(ID %in% sample_names(ps.1))
result <- ramdom_Model_env.plot(model = MEs,
sink =env.1[,1:2] )
p <- result[[2]]
p
data = result[[1]]
head(data)
# Number of modules -> controls the saved figure height.
hit <- dim(MEs)[2]
hit
FileName <- paste(netpath,"./ranImportant_Model_order_", id[i],".pdf", sep = "")
ggsave(FileName, p,width = 6,height =hit/5)
FileName <- paste(netpath,"./ranImportant_Model_order", id[i], ".csv", sep = "")
write.csv(data,FileName)
nGenes = ncol(otu)
nSamples = nrow(otu)
# Module-eigengene vs environment correlations; use = "p" = pairwise complete.
moduleTraitCor = cor(MEs, envRDA[sample_names(ps.1),], use = "p")
moduleTraitPvalue = WGCNA::corPvalueStudent(moduleTraitCor, nSamples)
#sizeGrWindow(10,6)
# dim(MEs)[2]/2
# dim(envRDA)[2]/2
pdf(file=paste(netpath,"./","Module-env_relationships.pdf",sep = ""),width=dim(envRDA)[2]/2,height=dim(MEs)[2]/2)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Display the correlation values within a heatmap plot
WGCNA::labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(envRDA),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = WGCNA::greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
# Key microorganisms vs physicochemical relationships --
# NetmodelEnv = paste(res1path,"/HUB_micro_env/",sep = "")
# dir.create(NetmodelEnv)
# Kleinberg hub scores identify the most "hub-like" OTUs in the network.
igraph = make_igraph(cor)
# Number of top hub OTUs to keep.
tem = 10
hub = hub_score(igraph)$vector %>%
sort(decreasing = TRUE) %>%
head(tem) %>%
as.data.frame()
colnames(hub) = "hub_sca"
p = ggplot(hub) +
geom_bar(aes(x = hub_sca,y = reorder(row.names(hub),hub_sca)),stat = "identity",fill = "#4DAF4A")
p
FileName <- paste(netpath,"./hub_micro", ".pdf", sep = "")
ggsave(FileName, p,width = 6,height =tem/2)
FileName <- paste(netpath,"./hub_micro", ".csv", sep = "")
write.csv(hub,FileName)
id.2 = row.names(hub)
# ps.1 = ps %>% scale_micro("TMM") %>%
otu = phyloseq::otu_table(ps.1)
tax = phyloseq::tax_table(ps.1)
head(otu)
# Hub-OTU abundances: samples in rows, hub OTUs in columns.
data = otu[id.2,] %>% t() %>%
as.data.frame()
# Correlate hub-OTU abundances with the environmental table for this group.
result = cor_env_ggcorplot(
  env1 = envRDA[sample_names(ps.1),],
  env2 = data,
  label = F,
  col_cluster = F,
  row_cluster = F,
  method = "spearman",
  r.threshold= 0.5,
  p.threshold= 0
)
p1 <- result[[1]]  # heatmap
p1
p2 <- result[[2]]  # bubble plot
p2
hei = dim(env)[2]/5
#
# filename = paste(NetmodelEnv,"hum_env.csv",sep = "")
# write.csv(top10,filename)
# BUG FIX: p1 and p2 were both written to "hum_env.pdf" / "hum_env.jpg",
# so the heatmap was silently overwritten by the bubble plot. They are
# now saved under distinct names.
filename = paste0(netpath, "hum_env_heatmap.pdf")
ggsave(filename, p1, width = tem/2, height = hei)
filename = paste0(netpath, "hum_env_bubble.pdf")
ggsave(filename, p2, width = tem/2, height = hei)
filename = paste0(netpath, "hum_env_heatmap.jpg")
ggsave(filename, p1, width = tem/2, height = hei)
filename = paste0(netpath, "hum_env_bubble.jpg")
ggsave(filename, p2, width = tem/2, height = hei)
}
res1path <- "results/Other_microbiome_analysis/"
Envnetplot<- paste(res1path,"./Env_network",sep = "")
dir.create(Envnetplot)
# 16S only here; psITS = NULL makes merge16S_ITS use bacteria alone.
ps16s = readRDS("./data/dataNEW/ps_16s.rds")%>% ggClusterNet::scale_micro()
psITS = NULL
library(phyloseq)
conflicted::conflicts_prefer(ggplot2::theme_void)
#--The map files in the bacteria and fungi ps objects must be identical
ps.merge <- ggClusterNet::merge16S_ITS(ps16s = ps16s,
psITS= psITS,
NITS = 200,
N16s = 200)
map = phyloseq::sample_data(ps.merge)
# head(map)
# Collapse all samples into one group so a single global network is built.
map$Group = "one"
phyloseq::sample_data(ps.merge) <- map
#--Environmental factors: Hellinger-transform all numeric columns.
data1 = env
envRDA.s = vegan::decostand(envRDA,"hellinger")
data1[,-1] = envRDA.s
Gru = data.frame(ID = colnames(env)[-1],group = "env" )
head(Gru)
ID group
1 pH env
2 SOC env
3 TN env
4 NH4.N env
5 NO3.N env
6 AP env
library(sna)
library(ggClusterNet)
library(igraph)
# Bipartite microbe-environment network across all samples (big = fast
# path for large matrices; bio = TRUE builds the bipartite network).
result <- ggClusterNet::corBionetwork(ps = ps.merge,
N = 0,
r.threshold = 0.6,
p.threshold = 0.05,
big = TRUE,
group = "Group",
env = data1,
envGroup = Gru,
layout_net = "model_maptree2",
path = Envnetplot,
fill = "Phylum",
size = "igraph.degree",
scale = TRUE,
bio = TRUE,
zipi = FALSE,
step = 100,
width = 18,
label = TRUE,
height = 10
)
[1] "one"
num [1:34, 1:18] 0.0474 0.0633 0.0188 0.0588 0.069 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:34] "pH" "SOC" "TN" "NH4.N" ...
..$ : chr [1:18] "sample1" "sample10" "sample11" "sample12" ...
[1] "1"
[1] "2"
[1] "3"
p = result[[1]]
p
# Comparison of all sample network parameters
data = result[[2]]
# Export the global env-network figure in several formats;
# paste0() replaces the redundant paste(..., sep = "").
plotname1 = paste0(Envnetplot, "/network_all.jpg")
ggsave(plotname1, p, width = 15, height = 12, dpi = 72)
plotname1 = paste0(Envnetplot, "/network_all.png")
ggsave(plotname1, p, width = 10, height = 8, dpi = 72)
plotname1 = paste0(Envnetplot, "/network_all.pdf")
ggsave(plotname1, p, width = 15, height = 12)
tablename <- paste0(Envnetplot, "/co-occurrence_Grobel_net", ".csv")
write.csv(data, tablename)
# Relate network-module total abundance and per-sample network topological
# properties to one environmental variable, separately for each sample group.
#
# ps          phyloseq object with a 'Group' column in sample_data.
# corpath     output directory for the correlation plots.
# Top         number of most-abundant OTUs retained per group.
# r.threshold / p.threshold   correlation filters for network construction.
# env         data frame; first column = sample ID, remaining columns = variables.
# select.mod  module names to keep (character vector), or a single number
#             meaning "the n largest modules".
# select.env  name of the env column to correlate against.
# Side effects: writes two PDF files per group under corpath.
# NOTE(review): the defaults (ps = ps, env = env, ...) are self-referential
# and unusable -- always pass arguments explicitly.
net.property_env = function(
ps = ps,
corpath = corpath,
Top = 500,
r.threshold= 0.8,
p.threshold=0.05,
env = env,
select.mod = c("model_1","model_2","model_3"),
select.env = "pH"
){
map = sample_data(ps)
id3 = map$Group %>% unique()
for (m in 1:length(id3)) {
#--- Microbial network over all modules ------
# Per-group relative-abundance network on the Top most abundant OTUs.
pst = ps %>%
filter_taxa(function(x) sum(x ) > 0, TRUE) %>%
scale_micro("rela") %>%
subset_samples.wt("Group",id3[m]) %>%
filter_OTU_ps(Top)
result = cor_Big_micro(ps = pst,
N = 0,
r.threshold= r.threshold,
p.threshold= p.threshold,
method = "spearman")
cor = result[[1]]
head(cor)
# igraph = make_igraph(cor)
#--计算模块信息,部分OTU没有模块,注意去除
#--Calculation module information, some OTUs have no modules, please remove them
# netClu = modulGroup( cor = cor,cut = NULL,method = "cluster_fast_greedy" )
# head(netClu)
# result2 = model_maptree_group(cor = cor,
# nodeGroup = netClu,
# )
result2 = model_maptree2(cor = cor, method = "cluster_fast_greedy")
#----多功能性和环境因子等和模块相关(Multifunctionality and environmental factors are related to modules)#-----
# 第一种是模块特征向量(The first is the module feature vector)
#--基于模块的OTU,计算在不同分组中的总丰度zscore 并统计检验#-------
# Based on the module's OTU, calculate the total abundance zscore in different groups and perform statistical tests
select.mod = select.mod
mod1 = result2[[2]]
head(mod1)
# Module sizes (number of OTUs per module), largest first.
# NOTE(review): 'tem' is reused for a different table inside the i loop below.
tem = mod1$group %>% table() %>%
as.data.frame() %>%
dplyr::arrange(desc(Freq))
colnames(tem) = c("Model","OTU.num")
head(tem)
# Numeric select.mod = keep the n largest modules; character = keep by name.
# NOTE(review): scalar condition -- '&&' would be the idiomatic operator here.
if (length(select.mod) == 1 & is.numeric(select.mod)) {
select.mod.name = tem$Model[1:select.mod]
mod1 = mod1 %>% filter(!group == "mother_no",
group %in%c(select.mod.name)
) %>% select(ID,group,degree)
} else if (is.character(select.mod)) {
select.mod.name = select.mod
mod1 = mod1 %>% filter(!group == "mother_no",
group %in%c(select.mod.name)
) %>% select(ID,group,degree)
}
id.s = mod1$group %>% unique()
# For each selected module: z-score its OTU abundances and sum per sample.
for (i in 1:length(id.s)) {
id.t = mod1 %>%
dplyr::filter(group %in% id.s[i]) %>%
.$ID
ps.t = ps %>%
scale_micro() %>%
subset_taxa.wt("OTU", id.t )
otu = ps.t %>%
vegan_otu() %>%
t()
colSD = function(x){
apply(x,2, sd)
}
# Column-wise z-score (per sample) of the module's OTU abundances.
dat = (otu - colMeans(otu))/colSD(otu)
head(dat)
otu_table(ps.t) = otu_table(as.matrix(dat),taxa_are_rows = TRUE)
#--计算总丰度(Calculate total abundance)
otu = ps.t %>% vegan_otu() %>% t()
colSums(otu)
dat = data.frame(id = names(colSums(otu)),abundance.zscore = colSums(otu))
colnames(dat)[2] = id.s[i]
if (i ==1) {
tem = dat
} else{
dat$id = NULL
tem = cbind(tem,dat)
}
}
head(tem)
map =sample_data(ps.t)
map$id = row.names(map)
map = map[,c("id","Group")]
# NOTE(review): as.tibble() is deprecated in favour of as_tibble().
data = map %>%
as.tibble() %>%
inner_join(tem,by = "id") %>%
dplyr::rename(group = Group)
colnames(env)[1] = "id"
# env1$id = row.names(env1)
subenv = env %>% dplyr::select(id,everything()) %>% select(id,select.env )
head(data)
tab = data %>% left_join(subenv,by = "id")
head(tab)
library(reshape2)
# Long format: one row per (sample, module) with the env value attached.
mtcars2 = melt(tab, id.vars=c(select.env,"group","id"))
mtcars2$variable
head(mtcars2)
lab = mean(mtcars2[,select.env])
# Scatter + regression of env variable against module z-score abundance.
p1_1 = ggplot2::ggplot(mtcars2,aes(x= value,!!sym(select.env), colour=variable)) +
ggplot2::geom_point() +
ggpubr::stat_cor(label.y=lab*1.1)+
ggpubr::stat_regline_equation(label.y=lab*1.1,vjust = 2) +
facet_wrap(~variable, scales="free_x") +
geom_smooth(aes(value,!!sym(select.env), colour=variable), method=lm, se=T)+
theme_classic()
p1_1
filename = paste(corpath,"/cor_netowrkmodel",select.env,".pdf",sep = "")
ggsave(filename,p1_1,width = 16,height = 18)
#-多功能性和网络属性相关(Versatility and network properties)#---------
igraph = make_igraph(cor)
dat = igraph::V(igraph)
names(dat) %>% length()
#--弄清楚每个样本包含的OTU数量
#--Find out the number of OTUs contained in each sample
# pst = ps %>%
# scale_micro("rela") %>%
# phyloseq::subset_samples(Group %in% c("KO","WT","OE")) %>%
# filter_OTU_ps(500)
print("1")
# otu = pst %>%
# # phyloseq::subset_samples(Group %in% c("KO","WT","OE")) %>%
# # filter_OTU_ps(500) %>%
# subset_taxa(row.names(tax_table(pst)) %in% names(dat)) %>%
# vegan_otu() %>%
# t()
# print("1")
# Presence/absence matrix of network OTUs per sample.
otu = pst %>% vegan_otu() %>% t()
otu = otu[row.names(otu) %in% names(dat),]
otu[otu > 1] = 1
dim(otu)
A = list()
dat.f = NULL
# For each sample: induce the subgraph of its OTUs and compute its
# topological properties.
for (i in 1:length(colnames(otu))) {
tem = otu[,colnames(otu)[i]][otu[,colnames(otu)[i]] > 0 ] %>% names()
A[[colnames(otu)[i]]] = tem
#-计算性质(Computational properties)
tem.2 = A[[colnames(otu)[i]]]
tem.g = igraph::induced_subgraph(igraph,tem.2)
dat = net_properties.2(tem.g,n.hub = FALSE)
head(dat,n = 16)
# NOTE(review): row 16 of the property table is zeroed -- presumably a
# non-numeric entry from net_properties.2; confirm against its output.
dat[16,1] = 0
dat = as.data.frame(dat)
dat$value = as.numeric(dat$value)
colnames(dat) = colnames(otu)[i]
if (i == 1) {
dat.f = dat
} else {
dat.f = cbind(dat.f,dat)
}
}
head(dat.f)
# Samples in rows, network properties in columns.
dat.f = dat.f %>%
t() %>%
as.data.frame()
select.env = select.env
# env1$id = row.names(env1)
# env1 = env1 %>% dplyr::select(id,everything()) %>% select(id,select.env )
head(dat.f)
dat.f$id = row.names(dat.f)
dat.f = dat.f %>% dplyr:: select(id,everything())
tab = dat.f %>% left_join(subenv,by = "id")
head(tab)
mtcars2 = melt(tab, id.vars=c(select.env,"id"))
lab = mean(mtcars2[,select.env])
head(mtcars2)
# Scatter + regression of env variable against each network property.
p0_1 = ggplot2::ggplot(mtcars2,aes(x= value,!!sym(select.env), colour=variable)) +
ggplot2::geom_point() +
ggpubr::stat_cor(label.y=lab*1.1)+
ggpubr::stat_regline_equation(label.y=lab*1.1,vjust = 2) +
facet_wrap(~variable, scales="free_x") +
geom_smooth(aes(value,!!sym(select.env), colour=variable), method=lm, se=T)+
theme_classic()
p0_1
filename = paste(corpath,"/cor_netowrkpropertities",select.env,".pdf",sep = "")
ggsave(filename,p0_1,width = 16,height = 18)
}}
res1path <- "results/Other_microbiome_analysis/"
library(tidyverse)
library(igraph)
corpath = paste(res1path,"./env_difference_plot/",sep = "")
dir.create(corpath)
ps = readRDS("./data/dataNEW/ps_16s.rds")
# Correlate module abundances and network properties with soil pH, per group.
net.property_env(
ps = ps,
corpath = corpath,
Top = 500,
r.threshold= 0.8,
p.threshold=0.05,
env = env,
select.mod = c("model_1","model_2","model_3"),
select.env = "pH")
[1] "1"
[1] "1"
[1] "1"
res1path <- "results/Other_microbiome_analysis/"
Envnetplot<- paste(res1path,"./16s_ITS_Env_network",sep = "")
dir.create(Envnetplot)
# Combined bacteria (16S) + fungi (ITS) objects, both scaled.
ps16s = readRDS("./data/dataNEW//ps_16s.rds") %>% ggClusterNet::scale_micro()
psITS = readRDS("./data/dataNEW//ps_ITS.rds")%>% ggClusterNet::scale_micro()
library(phyloseq)
#--The map files in the bacteria and fungi ps objects must be identical
ps.merge <- ggClusterNet::merge16S_ITS(ps16s = ps16s,
psITS = psITS,
NITS = 200,
N16s = 200)
ps.merge
phyloseq-class experiment-level object
otu_table() OTU Table: [ 400 taxa and 18 samples ]
sample_data() Sample Data: [ 18 samples by 2 sample variables ]
tax_table() Taxonomy Table: [ 400 taxa by 8 taxonomic ranks ]
map = phyloseq::sample_data(ps.merge)
# head(map)
# One global group -> a single cross-kingdom network.
map$Group = "one"
phyloseq::sample_data(ps.merge) <- map
# Hellinger-transformed environmental table for the network.
data1 = env
envRDA.s = vegan::decostand(envRDA,"hellinger")
data1[,-1] = envRDA.s
Gru = data.frame(ID = colnames(env)[-1],group = "env" )
head(Gru)
ID group
1 pH env
2 SOC env
3 TN env
4 NH4.N env
5 NO3.N env
6 AP env
library(sna)
library(ggClusterNet)
library(igraph)
# Bipartite 16S+ITS vs environment network across all samples.
result <- ggClusterNet::corBionetwork(ps = ps.merge,
N = 0,
r.threshold = 0.6, # correlation threshold
p.threshold = 0.05,
big = TRUE,
group = "Group",
env = data1, # environmental-variable table
envGroup = Gru,# env-factor grouping table
layout_net = "model_maptree2",
path = Envnetplot,# output directory for result files
fill = "Phylum", # node fill colour variable
size = "igraph.degree", # node size variable
scale = TRUE, # apply relative-abundance standardisation
bio = TRUE, # build the bipartite network
zipi = FALSE, # skip Zi-Pi computation
step = 100, # number of random-network resamplings
width = 18,
label = TRUE,
height = 10
)
[1] "one"
num [1:34, 1:18] 0.0474 0.0633 0.0188 0.0588 0.069 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:34] "pH" "SOC" "TN" "NH4.N" ...
..$ : chr [1:18] "sample1" "sample10" "sample11" "sample12" ...
[1] "1"
[1] "2"
[1] "3"
p = result[[1]]
p
# Comparison of all sample network parameters
data = result[[2]]
# Export the figure in several formats; paste0() replaces paste(sep = "").
plotname1 = paste0(Envnetplot, "/network_all.jpg")
ggsave(plotname1, p, width = 15, height = 12, dpi = 72)
plotname1 = paste0(Envnetplot, "/network_all.png")
ggsave(plotname1, p, width = 10, height = 8, dpi = 72)
plotname1 = paste0(Envnetplot, "/network_all.pdf")
ggsave(plotname1, p, width = 15, height = 12)
tablename <- paste0(Envnetplot, "/co-occurrence_Grobel_net", ".csv")
write.csv(data, tablename)
# Only the bacteria-fungi (16S-ITS) cross-kingdom correlations are of
# interest here, not within-kingdom correlations.
res1path <- "results/Other_microbiome_analysis/"
Envnetplot<- paste(res1path,"./16S_ITS_network",sep = "")
dir.create(Envnetplot)
ps16s = readRDS("./data/dataNEW/ps_16s.rds")
psITS = readRDS("./data/dataNEW/ps_ITS.rds")
#--The map files in the bacteria and fungi ps objects must be identical
ps.merge <- ggClusterNet::merge16S_ITS(ps16s = ps16s,
psITS = psITS,
N16s = 300,
NITS = 300
)
ps.merge
ps.merge
phyloseq-class experiment-level object
otu_table() OTU Table: [ 600 taxa and 18 samples ]
sample_data() Sample Data: [ 18 samples by 2 sample variables ]
tax_table() Taxonomy Table: [ 600 taxa by 8 taxonomic ranks ]
map = phyloseq::sample_data(ps.merge)
# head(map)
map$Group = "one"
phyloseq::sample_data(ps.merge) <- map
# No extra node labels supplied.
data = NULL
library(sna)
library(igraph)
library(ggClusterNet)
library(phyloseq)
# Cross-kingdom (bio = TRUE) co-occurrence network, no env factors this time.
result <- corBionetwork(ps = ps.merge,
N = 0,
lab = data,
r.threshold = 0.6,
p.threshold = 0.05,
group = "Group",
# env = data1,
# envGroup = Gru,
layout_net = "model_maptree2",
path = Envnetplot,
fill = "Phylum",
size = "igraph.degree",
scale = TRUE,
bio = TRUE,
zipi = F,
step = 100,
width = 12,
label = TRUE,
height = 10,
big = TRUE,
select_layout = TRUE,
# layout_net = "model_maptree",
clu_method = "cluster_fast_greedy"
)
[1] "one"
[1] "1"
[1] "2"
[1] "3"
# Module detection on the returned correlation matrix (result[[5]]).
tem <- model_maptree(cor =result[[5]],
method = "cluster_fast_greedy",
seed = 12)
node_model = tem[[2]]
head(node_model)
ID group degree
1 fun_ASV_311 14 72
2 fun_ASV_203 14 71
3 fun_ASV_246 14 70
4 fun_ASV_68 14 63
5 fun_ASV_268 14 62
6 fun_ASV_76 14 62
p = result[[1]]
p
# Comparison of all sample network parameters
data = result[[2]]
# Save the figure and all network tables; paste0() replaces paste(sep = "").
plotname1 = paste0(Envnetplot, "/network_all.pdf")
ggsave(plotname1, p, width = 10, height = 8)
tablename <- paste0(Envnetplot, "/co-occurrence_Grobel_net", ".csv")
write.csv(data, tablename)
# Filename spelling ("imformation") kept for backward compatibility.
tablename <- paste0(Envnetplot, "/node_model_imformation", ".csv")
write.csv(node_model, tablename)
tablename <- paste0(Envnetplot, "/nodeG_plot", ".csv")
write.csv(result[[4]], tablename)
tablename <- paste0(Envnetplot, "/edge_plot", ".csv")
write.csv(result[[3]], tablename)
tablename <- paste0(Envnetplot, "/cor_matrix", ".csv")
write.csv(result[[5]], tablename)
library(tidyverse)
res1path <- "results/Other_microbiome_analysis/"
# res1path = "result_and_plot/Micro_and_other_index_16s/"
Envnetplot<- paste(res1path,"/16S_ITS_network_Genus",sep = "")
dir.create(Envnetplot)
ps16s = readRDS("./data/dataNEW/ps_16s.rds")
psITS = readRDS("./data/dataNEW/ps_ITS.rds")
#--The map files in the bacteria and fungi ps objects must be identical
ps.merge <- merge16S_ITS(ps16s = ps16s,
psITS = psITS,
N16s = 300,
NITS = 300
)
ps.merge
phyloseq-class experiment-level object
otu_table() OTU Table: [ 600 taxa and 18 samples ]
sample_data() Sample Data: [ 18 samples by 2 sample variables ]
tax_table() Taxonomy Table: [ 600 taxa by 8 taxonomic ranks ]
map = phyloseq::sample_data(ps.merge)
# head(map)
map$Group = "one"
phyloseq::sample_data(ps.merge) <- map
# Agglomerate counts to genus level, then pull the collapsed taxonomy table.
tem.0 = ps.merge %>% tax_glom_wt(ranks = "Genus")
tax = tem.0 %>% vegan_tax() %>%
as.data.frame()
head(tax)
filed Kingdom Phylum Class
Acidocella bac Bacteria Proteobacteria Alphaproteobacteria
Acrobeloides fun Metazoa Nematoda Chromadorea
Anteholosticha fun Alveolata Ciliophora Spirotrichea
Aquicella bac Bacteria Proteobacteria Gammaproteobacteria
Arcopilus fun Fungi Ascomycota Sordariomycetes
Arenimonas bac Bacteria Proteobacteria Gammaproteobacteria
Order Family Genus
Acidocella Rhodospirillales Acetobacteraceae Acidocella
Acrobeloides Rhabditida Cephalobidae Acrobeloides
Anteholosticha Urostylida Holostichidae Anteholosticha
Aquicella Legionellales Coxiellaceae Aquicella
Arcopilus Sordariales Chaetomiaceae Arcopilus
Arenimonas Xanthomonadales Xanthomonadaceae Arenimonas
# No extra node labels supplied.
data = NULL
library(sna)
library(igraph)
library(ggClusterNet)
library(phyloseq)
library(WGCNA)
# Genus-level cross-kingdom co-occurrence network.
result <- corBionetwork(ps = tem.0,
N = 0,
lab = data,
r.threshold = 0.6,
p.threshold = 0.05,
group = "Group",
# env = data1,
# envGroup = Gru,
# layout = "fruchtermanreingold",
path = Envnetplot,
fill = "Phylum",
size = "igraph.degree",
scale = TRUE,
bio = TRUE,
zipi = F,
step = 100,
width = 12,
label = TRUE,
height = 10,
big = TRUE,
select_layout = TRUE,
layout_net = "model_maptree2",
clu_method = "cluster_fast_greedy"
)
[1] "one"
[1] "1"
[1] "2"
[1] "3"
# Genus-level modules from the correlation matrix.
tem <- model_maptree(cor =result[[5]],
method = "cluster_fast_greedy",
seed = 12)
node_model = tem[[2]]
head(node_model)
ID group degree
1 Microascus 8 15
2 Hydrogonium 4 8
3 Conlarium 3 7
4 Mortierella 8 6
5 Arcopilus 6 5
6 Humicola 6 5
# Genus abundance matrix (samples x genera) for eigengene computation.
otu = tem.0 %>% vegan_otu() %>%
as.data.frame()
# Align module assignments with the genus column order.
node_model = node_model[match(colnames(otu),node_model$ID),]
MEList = moduleEigengenes(otu, colors = node_model$group)
MEs = MEList$eigengenes
nGenes = ncol(otu)
nSamples = nrow(otu)
# Module-eigengene vs environment correlations and Student p-values (WGCNA).
moduleTraitCor = cor(MEs, envRDA, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
#sizeGrWindow(10,6)
pdf(file=paste(Envnetplot,"/","Module-env_relationships.pdf",sep = ""),width=10,height=6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(envRDA),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
png
2
p = result[[1]]
p
# Comparison of all sample network parameters
data = result[[2]]
# Save figure and network tables; paste0() replaces paste(sep = "").
plotname1 = paste0(Envnetplot, "/network_all.pdf")
ggsave(plotname1, p, width = 10, height = 8)
tablename <- paste0(Envnetplot, "/co-occurrence_Grobel_net", ".csv")
write.csv(data, tablename)
tablename <- paste0(Envnetplot, "/nodeG_plot", ".csv")
write.csv(result[[4]], tablename)
tablename <- paste0(Envnetplot, "/edge_plot", ".csv")
write.csv(result[[3]], tablename)
tablename <- paste0(Envnetplot, "/cor_matrix", ".csv")
write.csv(result[[5]], tablename)
library(picante)
library(ape)
library(vegan)
library(FSA)
library(eulerr)
library(grid)
library(gridExtra)
# NOTE(review): prefer library() over require() for hard dependencies --
# require() only returns FALSE instead of erroring when a package is missing.
require(minpack.lm)
require(Hmisc)
require(stats4)
library(parallel)
library(ggClusterNet)
library(phyloseq)
res1path = "results/Other_microbiome_analysis/"
# NOTE(review): paste() here yields "..././/" in the path; harmless on most
# systems but file.path() would be cleaner.
phypath = paste(res1path,"./Phylogenetic_analyse_spacies/",sep = "")
dir.create(phypath)
# Fit Sloan's neutral community model (NCM) separately for each group and
# plot detection frequency vs. log10 metacommunity abundance together with
# the fitted curve and its 95% confidence band.
#
# Arguments:
#   otu/tax/map/tree/ps : inputs forwarded to inputMicro() (ps takes
#                         precedence over the raw tables when supplied)
#   group : name of the grouping column in the sample data
#   ncol, nrow : layout for the combined ggarrange() panel
# Returns a list: [[1]] combined plot, [[2]] named list of per-group plots,
#   [[3]] per-group observed occupancy/abundance, [[4]] per-group predictions.
neutralModel = function(otu = NULL,
                        tax = NULL,
                        map = NULL,
                        tree = NULL,
                        ps = NULL,
                        group = "Group",
                        ncol = 3,
                        nrow = 1
){
  ps = inputMicro(otu,tax,map,tree,ps,group = group)
  # Rarefy to the minimum library size; fixed seed for reproducibility
  set.seed(72)
  psrare = rarefy_even_depth(ps)
  # Relative abundances (kept for parity with the rest of the pipeline)
  ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))

  map = as.data.frame(sample_data(psrare))
  # levels() is NULL when Group is a plain character column, which would
  # silently skip the loop below; fall back to the unique values
  aa = levels(map$Group)
  if (is.null(aa)) aa = unique(as.character(map$Group))
  map$ID = row.names(map)
  plots = list()
  dat1 = list()
  dat2 = list()
  for (i in seq_along(aa)) {
    # Subset the metadata to the current group
    # (as_tibble replaces the deprecated as.tibble)
    maps <- dplyr::filter(as_tibble(map), Group %in% aa[i])
    maps = as.data.frame(maps)
    row.names(maps) = maps$ID
    ps_sub = psrare
    sample_data(ps_sub) = maps
    # OTU table with samples in rows
    OTU.table = t(otu_table(ps_sub))
    # N: average read depth = size of the metacommunity
    N <- mean(apply(OTU.table, 1, sum))
    # Mean relative abundance of each taxon across communities (zeros removed)
    p.m <- apply(OTU.table, 2, mean)
    p.m <- p.m[p.m != 0]
    p <- p.m/N
    p.df = data.frame(p) %>%
      rownames_to_column(var="OTU")
    # Occurrence frequency of each taxon across communities
    OTU.table.bi <- 1*(OTU.table>0)
    freq.table <- apply(OTU.table.bi, 2, mean)
    freq.table <- freq.table[freq.table != 0]
    freq.df = data.frame(OTU=names(freq.table), freq=freq.table)
    # Combine abundance and frequency; keep taxa present in both
    C <- inner_join(p.df,freq.df, by="OTU") %>%
      arrange(p)
    # Belt-and-braces removal of zero rows
    C.no0 <- C %>%
      filter(freq != 0, p != 0)
    # Detection limit: one read out of N
    d <- 1/N
    # Fit the migration parameter m (Nm) by non-linear least squares
    p.list <- C.no0$p
    freq.list <- C.no0$freq
    m.fit <- nlsLM(freq.list ~ pbeta(d, N*m*p.list, N*m*(1-p.list), lower.tail=FALSE), start=list(m=0.1))
    m.ci <- confint(m.fit, 'm', level=0.95)
    m.sum <- summary(m.fit)
    m.coef = coef(m.fit)
    # Predicted occupancy and pseudo-R2 of the fit
    freq.pred <- pbeta(d, N*coef(m.fit)*p.list, N*coef(m.fit)*(1-p.list), lower.tail=FALSE)
    Rsqr <- 1 - (sum((freq.list - freq.pred)^2))/(sum((freq.list - mean(freq.list))^2))
    # Model-fit statistics (used in the plot title)
    fitstats <- data.frame(m=m.coef, m.low.ci=m.ci[1], m.up.ci=m.ci[2],
                           Rsqr=Rsqr, p.value=m.sum$parameters[4], N=N,
                           Samples=nrow(OTU.table), Richness=length(p.list),
                           Detect=d)
    # Wilson 95% CI around the predicted frequencies
    freq.pred.ci <- binconf(freq.pred*nrow(OTU.table), nrow(OTU.table), alpha=0.05, method="wilson", return.df=TRUE)
    pred.df <- data.frame(metacomm_RA=p.list, frequency=freq.pred,
                          frequency_lowerCI=freq.pred.ci[,2],
                          frequency_upperCI=freq.pred.ci[,3]) %>%
      unique()
    # Observed occupancy vs. abundance
    obs.df = C.no0 %>%
      dplyr::rename(metacomm_RA = p, frequency=freq)
    # Plot variable renamed from `p` to avoid shadowing the abundance vector
    gp = ggplot(data=obs.df) +
      geom_point(data=obs.df, aes(x=log10(metacomm_RA), y=frequency),
                 alpha=.3, size=2, color="#8DD3C7") +
      geom_line(data=pred.df, aes(x=log10(metacomm_RA), y=frequency), color="#FFFFB3") +
      geom_line(data=pred.df, aes(x=log10(metacomm_RA), y=frequency_lowerCI), linetype=2, color="#FFFFB3") +
      geom_line(data=pred.df, aes(x=log10(metacomm_RA), y=frequency_upperCI), linetype=2, color="#FFFFB3") +
      labs(x="Log10 abundance in\nmetacommunity", y="Frequency detected",title = paste(aa[i],paste("R^2 == ", round(fitstats$Rsqr, 3)),paste("italic(m) ==", round(fitstats$m, 3)))) +
      theme_bw() +
      theme(axis.line = element_line(color="black"),
            legend.position = "none",
            axis.title = element_text(size=14),
            axis.text = element_text(size=12))
    plots[[aa[i]]] = gp
    dat1[[aa[i]]] = obs.df
    dat2[[aa[i]]] = pred.df
  }
  # Arrange all per-group panels into one figure
  p = ggpubr::ggarrange(plotlist = plots,common.legend = TRUE, legend="right",ncol = ncol,nrow = nrow)
  return(list(p,plots,dat1,dat2))
}
# Load the 16S phyloseq object and fit the neutral model per group
ps = readRDS("./data/dataNEW/ps_16s.rds")
result = neutralModel(ps = ps, group = "Group", ncol = 3)
# Merge charts: combined panel of the per-group fits
p1 = result[[1]]
p1
# Write the panel as PDF and PNG
FileName <- paste0(phypath, "./1_neutral_modelCul", ".pdf")
ggsave(FileName, p1, width = 12, height = 4)
FileName <- paste0(phypath, "./1_neutral_modelCul", ".png")
ggsave(FileName, p1, width = 12, height = 4)
#-- Phylogenetic signal
# phyloSignal: for every group, estimate each OTU's environmental niche
# preference (abundance-weighted averages via vegan::wascores) and test for
# phylogenetic signal with a Mantel correlogram between niche-preference
# distance and phylogenetic (cophenetic) distance. One RDS file is written
# per group x environmental variable under `path`; the computation is slow,
# so results are cached and plotted separately by phySigPlot().
#
# Arguments:
#   otu/tax/map/tree/ps : inputs forwarded to inputMicro()
#   env   : data.frame of environmental variables (rows keyed by sample name)
#   group : grouping column in the sample data
#   path  : output directory for the "*_crlg.rds" files
phyloSignal = function(otu = NULL,
tax = NULL,
map = NULL,
tree = NULL ,
ps = NULL,
env = env,
group = "Group",
path = "./"){
# Rarefy to the minimum library size (default of rarefy_even_depth)
ps = inputMicro(otu,tax,map,tree,ps,group = group)
ps
set.seed(72)
psrare = rarefy_even_depth(ps)
# Convert counts to relative abundances
ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))
map = as.data.frame(sample_data(psrare))
# Attach the environmental variables to the sample metadata by row name
mapE =merge(map,env,by = "row.names",all= TRUE)
row.names(mapE) = mapE$Row.names
mapE$Row.names = NULL
mapE$ID = row.names(mapE)
sample_data(ps.norm) = mapE
aa = levels(mapE$Group)
dir.create(path)
# Compute the Mantel correlogram for each group and cache each result to
# disk (the calculation is long; each group/variable only needs one run)
eco = "Endosp."
for (eco in as.character(unique(mapE$Group))){
# Subset data
print(paste("Now running", eco))
# sub.physeq = phyloseq::subset_samples(ps.norm , Group == eco)
sub.physeq = ps.norm
otu = as.data.frame(vegan_otu(ps.norm))
head(otu)
map = as.data.frame(sample_data(ps.norm))
mapsub <- map[map$Group == eco,]
sample_data(sub.physeq) = mapsub
# Remove OTUs not found in at least 3 samples
OTU.table = otu_table(sub.physeq)
OTU.table[OTU.table > 0] = 1
OTU.freq = rowSums(OTU.table)
OTU.freq = OTU.freq[OTU.freq > 2]
sub.physeq = prune_taxa(names(OTU.freq), sub.physeq)
sub.physeq
# get phylogenetic distances (pairwise cophenetic distances between tips)
tree = phy_tree(sub.physeq)
phylo.dist = cophenetic(tree)
sample_OTUs = tree$tip.label
sam.phylo.dist = phylo.dist[sample_OTUs, sample_OTUs]
# Keep only the lower triangle; the upper triangle is set to NA
sam.phylo.dist[upper.tri(sam.phylo.dist, diag=TRUE)] = NA
# Generate dataframe of niche preference for pH, SOC and CN
# site.chem.mat = data.frame(sample_data(sub.physeq)) %>%
# # mutate(CN = percent_C / percent_N) %>%
# dplyr::select(ID, colnames(env))
site.chem.mat = env[row.names(env) %in% row.names(mapsub),]
# rownames(site.chem.mat) = site.chem.mat$ID
# site.chem.mat$ID = NULL
site.chem.mat = as.matrix(site.chem.mat)
otu.table = t(otu_table(sub.physeq))
# head(otu.table)
# NOTE(review): the match() result is not checked — sample order between the
# OTU table and the environment matrix is assumed to agree; verify upstream.
match(row.names(otu.table),row.names(site.chem.mat))
# Niche preference: abundance-weighted average of each variable per OTU
OTU.niche = wascores(site.chem.mat, otu.table)
OTU.niche.df = data.frame(OTU.niche)
head( OTU.niche.df)
# i =1
for (i in 1:dim(OTU.niche.df)[2]) {
pH.pref = OTU.niche.df[[i]]
names(pH.pref) = rownames(OTU.niche.df)
pH.dist = as.matrix(dist(pH.pref), labels=TRUE)
sam.pH.dist = pH.dist[sample_OTUs, sample_OTUs]
sam.pH.dist[upper.tri(sam.pH.dist, diag=TRUE)] = NA
# Mantel correlogram: niche distance vs phylogenetic distance classes
sam.pH.crlg = mantel.correlog(sam.pH.dist, sam.phylo.dist)
# ?mantel.correlog
# File name pattern must match the reader in phySigPlot()
filename = paste(path,eco,colnames(OTU.niche.df[i]), "_crlg.rds", sep="_")
saveRDS(sam.pH.crlg, file=filename)
}
}
}
# Plot the Mantel correlograms (phylogenetic signal) cached by phyloSignal().
# Reads one "<path>_<group>_<variable>__crlg.rds" file per group x variable,
# combines them, and draws Mantel correlation against phylogenetic distance
# class, faceted by group (filled vs. open points = significant vs. not).
#
# Arguments mirror phyloSignal(); `env` supplies the variable names and
# `path` the cache directory.
# Returns list: [[1]] ggplot, [[2]] combined data with significance flag,
#   [[3]] raw combined correlogram table.
phySigPlot = function(otu = NULL,
                      tax = NULL,
                      map = NULL,
                      tree = NULL,
                      ps = NULL,
                      group = "Group",
                      env = env,
                      path = "./"){
  ps = inputMicro(otu,tax,map,tree,ps,group = group)
  mapE = as.data.frame(sample_data(ps))
  # Guard against a non-factor Group column (levels() would be NULL)
  grps = levels(mapE$Group)
  if (is.null(grps)) grps = unique(as.character(mapE$Group))
  # Collect every group x variable correlogram; accumulate in a list and
  # bind once instead of growing a data.frame with rbind() in the loop
  pieces = list()
  for (eco in grps) {
    for (i in seq_along(colnames(env))) {
      # File name pattern must match the writer in phyloSignal()
      fn = paste(path, eco, colnames(env[i]), "_crlg.rds", sep = "_")
      pieces[[length(pieces) + 1]] =
        data.frame(readRDS(file = fn)$mantel.res) %>%
        mutate(Group = eco, property = colnames(env)[i])
    }
  }
  data2 = do.call(rbind, pieces)
  # Flag significance and drop distance classes without a corrected p-value
  eco.crlg = data2 %>%
    mutate(sig = ifelse(Pr.corrected. <= 0.05, "significant", "non-significant")) %>%
    filter(!(is.na(Pr.corrected.)))
  eco.crlg$Group = factor(eco.crlg$Group)
  p = ggplot(data=eco.crlg, aes(x=class.index, y=Mantel.cor)) +
    geom_point(data=eco.crlg[eco.crlg$sig=="significant",], color = "black", size=2, shape=16) +
    geom_point(data=eco.crlg[eco.crlg$sig=="non-significant",], color = "black",size=2, shape=1) +
    geom_line(data=eco.crlg, aes(color=property)) +
    geom_hline(yintercept = 0, linetype=2) +
    labs(x = "Phylogenetic distance class", y="Mantel correlation", color="property") +
    facet_wrap(~Group,scales="free_y",ncol = 4)
  return(list(p,eco.crlg,data2))
}
# Read the environmental table; first column "ID" holds the sample names
env = read.csv("./data/dataNEW/env.csv")
head(env)
# Console output (pasted transcript, commented out so the script parses):
#   ID pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR
#   1 sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863
#   2 sample2 4.55 9.12 0.89 5.78 10.71 49.5 68.5 10.24 46.8 67 0.625 0.976
#   ...
#   LB SB RB R.S FRB LRW1 LRW2 LRW3 LRN1 LRN2 LRN3 LRL1 LRL2 LRL3 LRD1
#   1 48.4 275 315 0.975 91.4 35.3 29.4 26.7 19 41 96 31.7 132 131.9 0.188
#   ...
#   LRD2 LRD3 MaxO TLRN TLRL BI MID
#   1 0.0655 0.058 4 156 296 1.54 0.619
#   ... (remaining rows omitted)

# Copy used for constrained-ordination style input: samples as row names
envRDA = env
head(env)
#   (identical console output to the previous head(env) call)
row.names(envRDA) = env$ID
envRDA$ID = NULL
head(envRDA)
# Console output (same data, now with sample names as row names):
#   pH SOC TN NH4.N NO3.N AP AK CN LA Height TG RGR LB
#   sample1 4.45 7.96 0.70 6.86 9.45 36.2 74.2 11.37 35.6 63 0.609 0.863 48.4
#   ... (remaining rows omitted)

phypath2 = paste(phypath,"/phyloSignal/",sep = "")
# BUG FIX: this previously called dir.create(phypath) — the already-existing
# parent — instead of the new subdirectory phypath2. The mistake was masked
# because phyloSignal() also creates `path` internally.
dir.create(phypath2)
# Run on the 400 most abundant OTUs, for SOC and TN (columns 2:3 of envRDA)
phyloSignal(ps = ps %>% filter_OTU_ps(400),
group = "Group",
env = envRDA[,2:3],
path = phypath2)
# Console output:
#   [1] "Now running Group1"
#   [1] "Now running Group2"
#   [1] "Now running Group3"
# Build the correlogram figure from the cached RDS results
result = phySigPlot(ps = ps,group = "Group",env = envRDA[,2:3],path = phypath2)
# Extract the plot.
# NOTE(review): `mytheme1` is not defined in this chunk (the file header
# defines `mytheme`) — confirm `mytheme1` is created earlier in the file.
p2 = result[[1]] + mytheme1
p2
# Extract the plotting data
data = result[[2]]
head(data)
# Console output (pasted transcript, commented out so the script parses):
#   class.index n.dist Mantel.cor Pr.Mantel. Pr.corrected. Group property
#   D.cl.1 0.0681 713 0.01356 0.009 0.009 Group1 SOC
#   D.cl.2 0.1954 1021 0.00614 0.271 0.271 Group1 SOC
#   ... plus a `sig` column ("significant" / "non-significant")
FileName <- paste(phypath,"2_phySigPlot", ".pdf", sep = "")
ggsave(FileName, p2,width = 15,height = 6)
FileName <- paste(phypath,"2_phySigPlot", ".csv", sep = "")
write.csv(data,FileName)
# Null-model analysis of within-group community similarity.
# For each group, observed pairwise similarity (1 - dissimilarity) is
# compared with the similarity expected under randomized (null) communities,
# yielding effect sizes (ES, SES) and the ratio of non-random similarity.
#
# Arguments:
#   otu/tax/map/tree/ps : inputs forwarded to inputMicro()
#   group        : grouping column in the sample data
#   dist.method  : dissimilarity index passed to vegdist()
#   gamma.method : "total" = whole OTU pool as metacommunity,
#                  "group" = only OTUs present in the group
#   transfer     : reserved (currently unused)
#   null.model   : "ecosphere" (occurrence-weighted resampling) or a
#                  picante::randomizeMatrix model ("ecosim", "frequency")
#   nperm        : number of null-model permutations (new parameter,
#                  default 100 = previous hard-coded value)
# Returns list: [[1]] transposed summary table, [[2]] per-pair ratio
#   data.frame, [[3]] aov() across groups (NULL when only one group).
nullModel <- function(otu = NULL,
                      tax = NULL,
                      map = NULL,
                      tree = NULL ,
                      ps = NULL,
                      group = "Group",
                      dist.method = "bray",
                      gamma.method = "total",
                      transfer = "none",
                      null.model = "ecosphere",
                      nperm = 100){
  ps = inputMicro(otu,tax,map,tree,ps,group = group)
  map = as.data.frame(sample_data(ps))
  grp1 = unique(map$Group)
  # Build a list of sample names per group
  grp = list()
  for (i in seq_along(grp1)) {
    grp[[i]] = rownames(map)[which(map$Group == grp1[i])]
  }
  names(grp) = grp1
  report = c()
  dat4anova = c()
  grp4anova = c()
  report.ES = c()
  report.SES = c()
  otu = as.data.frame(t(vegan_otu(ps)))
  otu = as.matrix(otu)
  for (x in seq_along(grp)) {
    dataCK1 = otu[, grp[[x]]]
    # Drop OTUs absent from this group when the metacommunity is the group
    if (gamma.method == "group") {
      rsum1 = rowSums(dataCK1)
      tempCK1 = which(rsum1 == 0)
      if (length(tempCK1) != 0) {dataCK1 = dataCK1[-tempCK1,]}
    }
    # Observed within-group pairwise dissimilarity ...
    beta.dist = vegdist(t(dataCK1), method = dist.method)
    # ... converted to similarity
    similarity.ob = 1 - beta.dist
    # gamma = regional richness; alpha = per-sample richness
    gamma = nrow(dataCK1)
    alpha = colSums(dataCK1 > 0)
    # Occurrence weights used for drawing OTUs into null communities
    if (gamma.method == "group") {
      occur = apply(dataCK1, MARGIN = 1, FUN = sum)
    } else {
      occur = apply(otu, MARGIN = 1, FUN = sum)
    }
    r = nperm  # number of permutations
    # Accumulator for the mean null similarity across permutations
    similarity.pm = matrix(0, nrow = ncol(dataCK1), ncol = ncol(dataCK1))
    similarity.pm = as.dist(similarity.pm)
    for (i in 1:r) {
      # Empty OTU-by-sample matrix for the permuted community
      PRM1 = matrix(0, ncol = ncol(dataCK1), nrow = nrow(dataCK1))
      if (null.model == "ecosphere") {
        for (j in 1:ncol(dataCK1)) {
          # Keep the sample's observed non-zero abundances but reassign them
          # to OTUs drawn without replacement, weighted by overall occurrence
          aa = dataCK1[dataCK1[, j] > 0, j]
          PRM1[sample(1:gamma, alpha[j], replace = FALSE, prob = occur), j] = aa
        }
      } else if (null.model == "ecosim") {
        PRM1 = randomizeMatrix(dataCK1, null.model = "independentswap")
      } else if (null.model == "frequency") {
        PRM1 = randomizeMatrix(dataCK1, null.model = "frequency")
      }
      # Similarity of the permuted community, summed for later averaging
      dist_pm = vegdist(t(PRM1), method = dist.method)
      similarity.pm = similarity.pm + (1 - dist_pm)
    }
    similarity.pm = similarity.pm / r
    # Normality of the null similarities, and paired t-test vs observed
    normality = shapiro.test(similarity.pm)
    nor.p = normality$p.value
    ttest = t.test(similarity.pm, similarity.ob, alternative = "two.sided",
                   paired = TRUE, conf.level = 0.95)
    tt.p = ttest$p.value
    conf.int = ttest$conf.int
    pm.mean = mean(similarity.pm)
    pm.sd = sd(similarity.pm)
    # Effect size (log-ratio) and standardized effect size
    ES = log(similarity.ob) - log(similarity.pm)
    effect.size = mean(ES)
    effect.size.sd = sd(ES)
    SES = (similarity.ob - similarity.pm) / pm.sd
    sd.effect.size = mean(SES)
    sd.effect.size.sd = sd(SES)
    # Fraction of observed similarity not explained by the null expectation
    ratio = 1 - similarity.pm / similarity.ob
    ratio.mean = mean(ratio)
    ratio.sd = sd(ratio)
    dat4anova = c(dat4anova, as.vector(ratio))
    grp4anova = c(grp4anova, rep(names(grp)[x], length(ratio)))
    conf.int.str = paste("[", paste(signif(conf.int, digits = 3), collapse = "~"), "]", sep = "")
    report = rbind(report, c(mean(similarity.ob), sd(similarity.ob), pm.mean, pm.sd,
                             conf.int.str, nor.p, tt.p, effect.size, effect.size.sd,
                             sd.effect.size, sd.effect.size.sd, ratio.mean, ratio.sd))
    report.ES = c(report.ES, effect.size)
    report.SES = c(report.SES, sd.effect.size)
  }
  rownames(report) = grp1
  colnames(report) = c("Mean of observed similarity", "Standard deviation of observed similarity",
                       "Mean of permutated similarity", "Standard deviation of permutated similarity",
                       "95% Conf int of perm similarity", "Normality test (p) on Perm similarity",
                       "T test on Ob and Perm similarity", "Effect size (ES)", "SD of ES",
                       "Standardized effect size (SES)", "SD of SES", "Mean of Ratio", "SD of Ratio")
  rep = t(report)
  # This statistic tests whether the ratio differs between groups
  # (requires at least two groups)
  if (length(unique(grp4anova)) > 1) {
    aov.re = aov(dat4anova ~ grp4anova)
  } else {
    aov.re = NULL
  }
  # Keep the per-pair ratios for downstream use
  ratio = data.frame(ratio = dat4anova, group = grp4anova)
  return(list(rep, ratio, aov.re))
}
ps = readRDS("./data/dataNEW/ps_16s.rds")
# Keep only taxa with more than 1000 total reads
psphy = filter_taxa(ps, function(x) sum(x ) > 1000 , TRUE);psphy
# Console output (pasted transcript, commented out so the script parses):
#   phyloseq-class experiment-level object
#   otu_table() OTU Table: [ 597 taxa and 18 samples ]
#   sample_data() Sample Data: [ 18 samples by 2 sample variables ]
#   tax_table() Taxonomy Table: [ 597 taxa by 7 taxonomic ranks ]
#   phy_tree() Phylogenetic Tree: [ 597 tips and 595 internal nodes ]
#   refseq() DNAStringSet: [ 597 reference sequences ]
result <- nullModel(ps = psphy,
group="Group",
dist.method = "bray",
gamma.method = "total",
transfer = "none",
null.model = "ecosphere"
)
# Per-group null-model summary table
nullModeltab <- result[[1]]
# Per-pair non-random similarity ratio
ratiotab <- result[[2]]
# ANOVA of the ratio across groups
aovtab <- result[[3]]
FileName <- paste(phypath,"3_nullModeltab", ".csv", sep = "")
write.csv(nullModeltab,FileName)
FileName <- paste(phypath,"3_ratiotab", ".csv", sep = "")
write.csv(ratiotab,FileName)
# Resolve the attr() name clash (conflicted-package setup) before the
# dist-attribute manipulation in bNTICul()
conflicts_prefer(base::attr)
# bNTICul: beta Nearest Taxon Index (βNTI) for all sample pairs.
# βMNTD is computed for the observed communities and for `num` null
# communities (tree tip labels shuffled); βNTI = (obs - mean(null)) / sd(null).
#
# Arguments:
#   otu/tax/map/tree/ps : inputs forwarded to inputMicro()
#   group  : grouping column in the sample data
#   num    : number of null-model randomizations (999 typical in literature)
#   thread : cores for mclapply() (no effect on Windows; forks on Unix)
# Returns list: [[1]] data.frame of pairwise bMNTD, null mean/sd and bNTI.
bNTICul = function(otu = NULL,tax = NULL,map = NULL,tree = NULL ,ps = NULL,group = "Group",num = 99,thread = 1){
ps = inputMicro(otu,tax,map,tree,ps,group = group)
ps
ps_sub <- ps
# tree = phy_tree(ps)
# tree
# Adjust the sample metadata: add an ID column holding the sample names
map = as.data.frame(sample_data(ps_sub))
map$ID = row.names(map)
sample_data(ps) = map
# Prepare the OTU table: rarefy to the minimum library size
# (no explicit depth set, so rarefy_even_depth uses its default)
set.seed(72) # setting seed for reproducibility
psrare = rarefy_even_depth(ps_sub)
# Check the per-sample sequence counts
sample_sums(psrare)
# Relative abundances (not used further below)
ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))
# Compute βMNTD for one randomized null community (tip labels permuted)
bMNTD_null_func <- function(i, OTU.table, tree){
tree$tip.label = sample(tree$tip.label)
bMNTD_s = comdistnt(OTU.table, cophenetic(tree), abundance.weighted = TRUE)
# Unpack the "dist" object into a long Sample_1 / Sample_2 data.frame
A <- attr(bMNTD_s, "Size")
B <- if (is.null(attr(bMNTD_s, "Labels"))) sequence(A) else attr(bMNTD_s, "Labels")
if (isTRUE(attr(bMNTD_s, "Diag"))) attr(bMNTD_s, "Diag") <- FALSE
if (isTRUE(attr(bMNTD_s, "Upper"))) attr(bMNTD_s, "Upper") <- FALSE
bMNTD_s.df = data.frame(Sample_1 = B[unlist(lapply(sequence(A)[-1], function(x) x:A))],
Sample_2 = rep(B[-length(B)], (length(B)-1):1),
bMNTD = as.vector(bMNTD_s),
rep=i)
return(bMNTD_s.df)
}
# Compute βNTI: observed βMNTD standardized against the null distribution
Phylo_turnover <- function(physeq, reps, nproc){
# Extract OTU table
OTU.table = t(otu_table(physeq))
# Extract phylogenetic tree
tree = phy_tree(physeq)
# Get βMNTD between all communities
bMNTD_o = comdistnt(OTU.table, cophenetic(tree), abundance.weighted = TRUE)
# Unpack the observed "dist" object the same way as in bMNTD_null_func
A <- attr(bMNTD_o, "Size")
B <- if (is.null(attr(bMNTD_o, "Labels"))) sequence(A) else attr(bMNTD_o, "Labels")
if (isTRUE(attr(bMNTD_o, "Diag"))) attr(bMNTD_o, "Diag") <- FALSE
if (isTRUE(attr(bMNTD_o, "Upper"))) attr(bMNTD_o, "Upper") <- FALSE
bMNTD_o.df = data.frame(Sample_1 = B[unlist(lapply(sequence(A)[-1], function(x) x:A))],
Sample_2 = rep(B[-length(B)], (length(B)-1):1),
bMNTD = as.vector(bMNTD_o))
# Get βMNTD for randomized null communities
rep.list = seq(1, reps)
bMNTD_s.df.list = mclapply(rep.list, bMNTD_null_func, OTU.table=OTU.table, tree=tree, mc.cores=nproc)
# Combine all data together and calculate βNTI for each sample pair
bMNTD_s.df <- do.call("rbind", bMNTD_s.df.list)
bMNTD_s.means.df = bMNTD_s.df %>%
group_by(Sample_1, Sample_2) %>%
dplyr::summarize(mean_bMNTD = mean(bMNTD),
sd_bMNTD = sd(bMNTD))
bMNTD_o.df = inner_join(bMNTD_o.df, bMNTD_s.means.df, by=c("Sample_1", "Sample_2")) %>%
mutate(bNTI = (bMNTD - mean_bMNTD)/sd_bMNTD)
return(bMNTD_o.df)
}
# On a single core this takes >10 h on real data with 999 permutations, so
# use multiple cores where possible. The second argument is the iteration
# count (literature commonly uses 999); the third is the thread count.
bNTI = Phylo_turnover(psrare, num, thread)
return(list(bNTI))
}
ps = readRDS("./data/dataNEW/ps_16s.rds")
# Keep only taxa with more than 1000 total reads (βNTI is slow on the full set)
psphy = filter_taxa(ps, function(x) sum(x ) > 1000 , TRUE);psphy
# Console output (pasted transcript, commented out so the script parses):
#   phyloseq-class experiment-level object
#   otu_table() OTU Table: [ 597 taxa and 18 samples ]
#   sample_data() Sample Data: [ 18 samples by 2 sample variables ]
#   tax_table() Taxonomy Table: [ 597 taxa by 7 taxonomic ranks ]
#   phy_tree() Phylogenetic Tree: [ 597 tips and 595 internal nodes ]
#   refseq() DNAStringSet: [ 597 reference sequences ]
result = bNTICul(ps = psphy,group = "Group",num = 100,thread = 1)
bNTI = result[[1]]
head(bNTI)
# Console output (for reference):
#   Sample_1 Sample_2 bMNTD mean_bMNTD sd_bMNTD bNTI
#   1 sample10 sample1 0.004675 0.003640 0.001402 0.738
#   2 sample11 sample1 0.007456 0.006143 0.002386 0.550
#   ... (remaining rows omitted)
filename = paste(phypath,"/4_bNTI.csv",sep = "")
write.csv(bNTI, filename)
# RCbary: Raup-Crick dissimilarity based on Bray-Curtis (RCbray) for all
# sample pairs. Null communities are assembled by first drawing OTUs
# weighted by occurrence frequency (matching each sample's richness), then
# recruiting abundances weighted by metacommunity relative abundance; RCb is
# the position of the observed dissimilarity in the null distribution,
# rescaled from [0, 1] to [-1, 1].
#
# NOTE(review): unlike the sibling functions, `otu`, `tax`, `map`, `tree`
# and `group` are accepted but never used — `ps` is used directly; confirm
# whether an inputMicro() call was intended here.
#
# Arguments: ps = phyloseq object; num = permutations; thread = cores.
# Returns list: [[1]] data.frame with Site1, Site2, RCb.
RCbary = function(otu = NULL,tax = NULL,map = NULL,tree = NULL ,ps = NULL,group = "Group",num = 99,thread = 1){
ps_sub <- ps
# Organize the sample metadata: add an ID column
map = as.data.frame(sample_data(ps_sub))
map$ID = row.names(map)
sample_data(ps) = map
# Prepare the OTU table: rarefy to the minimum library size
# (no explicit depth set, so rarefy_even_depth uses its default)
set.seed(72) # setting seed for reproducibility
psrare = rarefy_even_depth(ps_sub )
# Check the per-sample sequence counts
sample_sums(psrare)
# Relative abundances (not used further below)
ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))
# Two helper functions follow
# Helper: Bray-Curtis distance between one simulated pair of communities
RCbray_null_func <- function(i, freq.abd.df, alpha1, alpha2, N){
# Get simulated communities and distance
## initally select OTUs weighted by their frequency. The number of OTUs selected should equal the richness of the samples.
simcom1 = data.frame(table(sample(freq.abd.df$OTU, size=alpha1, replace=FALSE, prob=freq.abd.df$freq)), stringsAsFactors = F)
colnames(simcom1) = c("OTU","simcom1")
simcom1$OTU = as.character(simcom1$OTU)
simcom1 = inner_join(simcom1, freq.abd.df, by="OTU")
simcom2 = data.frame(table(sample(freq.abd.df$OTU, size=alpha2, replace=FALSE, prob=freq.abd.df$freq)), stringsAsFactors = F)
colnames(simcom2) = c("OTU","simcom2")
simcom2$OTU = as.character(simcom2$OTU)
simcom2 = inner_join(simcom2, freq.abd.df, by="OTU")
## Now recruit OTUs based on their abundance in the metacommunity
simcom1.abd = data.frame(table(sample(simcom1$OTU, size=N-alpha1, replace=TRUE, prob=simcom1$p)), stringsAsFactors = F)
colnames(simcom1.abd) = c("OTU","simcom1.abd")
simcom1.abd$OTU = as.character(simcom1.abd$OTU)
# Selected-but-not-recruited OTUs keep abundance 1 (they were drawn once)
simcom1 = full_join(simcom1, simcom1.abd, by="OTU") %>%
mutate(simcom1.abd = ifelse(is.na(simcom1.abd), 1, simcom1.abd)) %>%
select(OTU, simcom1.abd)
simcom2.abd = data.frame(table(sample(simcom2$OTU, size=N-alpha2, replace=TRUE, prob=simcom2$p)), stringsAsFactors = F)
colnames(simcom2.abd) = c("OTU","simcom2.abd")
simcom2.abd$OTU = as.character(simcom2.abd$OTU)
simcom2 = full_join(simcom2, simcom2.abd, by="OTU") %>%
mutate(simcom2.abd = ifelse(is.na(simcom2.abd), 1, simcom2.abd)) %>%
select(OTU, simcom2.abd)
simcom = full_join(simcom1, simcom2, by="OTU")
simcom[is.na(simcom)] = 0
rownames(simcom) = simcom$OTU
simcom$OTU = NULL
null.dist = vegdist(t(simcom), method="bray")[1]
return(null.dist)
}
# Main RCbray driver: builds the null distribution for each sample pair
Calc_RCbray <- function(physeq, reps, nproc){
# Get OTU table from phyloseq object
otu.table = otu_table(physeq)
# Get alpha diversity for each sample
otu.PA.table = otu.table
otu.PA.table[otu.PA.table > 0] = 1
alpha.df = data.frame(Sample_ID = colnames(otu.PA.table), OTU.n = colSums(otu.PA.table), stringsAsFactors = F)
# Get beta diversity matrix
beta.table = as.matrix(vegdist(t(otu.PA.table), method="bray", diag=TRUE, upper=TRUE))
## Get metacommunity
# Calculate the number of individuals in the meta community (Average read depth)
N <- mean(apply(t(otu.table), 1, sum))
# Calculate the average relative abundance of each taxa across communities
p.m <- apply(t(otu.table), 2, mean)
p.m <- p.m[p.m != 0]
p <- p.m/N
# Calculate the occurrence frequency of each taxa across communities
otu.table.bi <- 1*(t(otu.table)>0)
freq <- apply(otu.table.bi, 2, mean)
freq <- freq[freq != 0]
# Combine
freq.abd.df = data.frame(p=p, freq=freq) %>%
tibble::rownames_to_column(var="OTU") %>%
filter(p != 0, freq != 0) %>%
arrange(p)
# For each pair of samples run the RCbray analysis
comps = combn(alpha.df$Sample_ID, m=2, simplify = F)
RCb.df = data.frame(Site1 = character(), Site2 = character(), RCb = numeric(), stringsAsFactors = F)
for (j in seq(1, length(comps))){
sam = comps[[j]]
alpha1 = alpha.df[alpha.df$Sample_ID == sam[1],]$OTU.n
alpha2 = alpha.df[alpha.df$Sample_ID == sam[2],]$OTU.n
# Permute "reps" many times
rep.list = seq(1, reps)
null.list = mclapply(rep.list, RCbray_null_func, freq.abd.df=freq.abd.df, alpha1=alpha1, alpha2=alpha2, N=N, mc.cores=nproc)
# RCb: proportion of null dissimilarities exceeding (plus half of those
# equal to) the observed dissimilarity, rescaled from [0, 1] to [-1, 1]
RCb = (length(null.list[null.list > beta.table[sam[1], sam[2]]]) + (0.5*length(null.list[null.list == beta.table[sam[1], sam[2]]])))/reps
RCb = (RCb - 0.5)*2
RCb.df = rbind(RCb.df, data.frame(Site1=sam[1], Site2=sam[2], RCb=RCb, stringsAsFactors = F))
}
RCb.df
return(RCb.df)
}
# Run the RCbray calculation (roughly 5 hours with 999 permutations on real data)
RCb = Calc_RCbray(psrare, num, thread)
head(RCb)
return(list(RCb))
}
ps = readRDS("./data/dataNEW/ps_16s.rds")
# Keep only taxa with more than 1000 total reads
psphy = filter_taxa(ps, function(x) sum(x ) > 1000 , TRUE);psphy
# Console output (pasted transcript, commented out so the script parses):
#   phyloseq-class experiment-level object
#   otu_table() OTU Table: [ 597 taxa and 18 samples ]
#   sample_data() Sample Data: [ 18 samples by 2 sample variables ]
#   tax_table() Taxonomy Table: [ 597 taxa by 7 taxonomic ranks ]
#   phy_tree() Phylogenetic Tree: [ 597 tips and 595 internal nodes ]
#   refseq() DNAStringSet: [ 597 reference sequences ]
result = RCbary(ps = psphy ,group = "Group",num = 10,thread = 1)
# BUG FIX: the result was previously assigned to `RCbary`, clobbering the
# RCbary() function defined above; use a distinct variable name instead.
RCbary.tab = result[[1]]
head(RCbary.tab)
# Console output (for reference):
#   Site1 Site2 RCb
#   1 sample1 sample10 1
#   2 sample1 sample11 1
#   ... (remaining rows omitted)
filename = paste(phypath,"/5_RCb.csv",sep = "")
write.csv(RCbary.tab,filename)
# Combine βNTI and RCbray into a community-assembly process classification
# (Stegen et al. framework): pairs with |bNTI| >= 2 are selection (variable
# or homogeneous); the remainder are split by RCbray into dispersal
# limitation (RCb >= 0.95), homogenizing dispersal (RCb <= -0.95) or drift.
#
# Arguments:
#   otu/tax/map/tree/ps : inputs forwarded to inputMicro()
#   RCb  : data.frame with Sample_1, Sample_2, RCb
#   bNTI : data.frame with Sample_1, Sample_2, bNTI (and bMNTD columns)
#   group : grouping column in the sample data
# Returns list: [[1]] bNTI boxplot, [[2]] process stacked barplot,
#   [[3]] combined panel, [[4]] merged pairwise table, [[5]] per-group
#   process percentages.
bNTIRCPlot = function(otu = NULL,tax = NULL,
                      map = NULL,tree = NULL ,
                      ps = NULL,
                      RCb = RCb,bNTI = bNTI,group = "Group"){
  ps = inputMicro(otu,tax,map,tree,ps,group = group)
  psrare <- ps
  map = as.data.frame(sample_data(psrare))
  map$ID = row.names(map)
  sample_data(psrare) = map
  # Group metadata for both members of each sample pair
  eco.meta1 = data.frame(sample_data(psrare)) %>%
    select(ID, Group) %>%
    dplyr::rename(Sample_1 = ID, Group_1 = Group)
  eco.meta2 = data.frame(sample_data(psrare)) %>%
    select(ID, Group) %>%
    dplyr::rename(Sample_2 = ID, Group_2 = Group)
  # Attach group labels to the bNTI pairs, then merge with RCbray
  bNTI.df = inner_join(bNTI, eco.meta1) %>%
    inner_join(eco.meta2)
  turnover.df = inner_join(bNTI.df, RCb)
  # Within-group pairs only for the bNTI boxplot
  within.bNTI.df = bNTI.df %>%
    filter(Group_1 == Group_2) %>%
    mutate(Group = Group_1)
  # Dashed lines mark the |bNTI| = 2 selection thresholds
  eco.bNTI.plot <- ggplot(within.bNTI.df, aes(x=Group, y=bNTI)) +
    geom_jitter(alpha = 0.1,color ="#984EA3") +
    geom_boxplot(outlier.shape=1,outlier.alpha = 0,fill = "#984EA3") +
    geom_hline(yintercept = 2, linetype=2, size=0.5) +
    geom_hline(yintercept = -2, linetype=2, size=0.5) +
    labs(x="", y="bNTI") +
    theme_classic() +
    theme(legend.position = "none",
          axis.text = element_text(size=12),
          axis.text.x = element_text(angle=45, hjust=1),
          axis.title = element_text(size=14))
  # Classify each within-group pair into an assembly process
  eco.turnover.df = turnover.df %>%
    filter(Group_1 == Group_2) %>%
    mutate(Group = Group_1) %>%
    mutate(process = ifelse(abs(bNTI) < 2,
                            ifelse(abs(RCb) < 0.95, "Drift",
                                   ifelse(RCb >= 0.95, "Dispersal Limited",
                                          ifelse(RCb <= -0.95, "Homogenizing Dispersal", "ERROR"))),
                            ifelse(bNTI >= 2, "Variable Selection",
                                   ifelse(bNTI <= -2, "Homogeneous Selection", "ERROR"))))
  eco.turnover.df$process = factor(eco.turnover.df$process,
                                   levels = c("Drift",
                                              "Dispersal Limited", "Homogenizing Dispersal",
                                              "Variable Selection", "Homogeneous Selection"))
  # Percentage of pairs per process within each group.
  # (Replaces the previous hard-coded "(n()/45)*100", which was dead code:
  # the percentage was immediately recomputed from the per-group totals.)
  sum.eco.turnover.df = eco.turnover.df %>%
    dplyr::count(Group, process, name = "n_sites") %>%
    dplyr::group_by(Group) %>%
    dplyr::mutate(perc = n_sites / sum(n_sites) * 100,
                  num = sum(n_sites)) %>%
    dplyr::ungroup() %>%
    as.data.frame()
  eco.turnover.plot = ggplot(sum.eco.turnover.df, aes(x=Group, y=perc, fill=process)) +
    geom_bar(stat="identity", color="black") +
    labs(x="", y="Percent of pairs (%)", fill="Process") +
    theme_bw() +
    theme(panel.grid = element_blank(),
          axis.text = element_text(size=12),
          axis.text.x = element_text(angle=45, hjust=1),
          axis.title = element_text(size=14),
          legend.key.size = unit(10, "mm"),
          legend.text = element_text(size=12),
          legend.title = element_text(size=14))
  # Merge the two panels side by side
  eco.plot = cowplot::plot_grid(eco.bNTI.plot, eco.turnover.plot,
                                rel_widths=c(0.6, 1), labels=c("A", "B"))
  return(list(eco.bNTI.plot, eco.turnover.plot, eco.plot, turnover.df, sum.eco.turnover.df))
}
# Re-read the cached βNTI results so this section can run independently
bNTI = read.csv(paste(phypath,"/4_bNTI.csv",sep = ""),row.names = 1)
head(bNTI)
# Console output (pasted transcript, commented out so the script parses):
#   Sample_1 Sample_2 bMNTD mean_bMNTD sd_bMNTD bNTI
#   1 sample10 sample1 0.004675 0.003640 0.001402 0.738
#   2 sample11 sample1 0.007456 0.006143 0.002386 0.550
#   ... (remaining rows omitted)
# Read the RCbray data and add Sample_1/Sample_2 columns; the names are
# swapped (Sample_1 = Site2) so the pair orientation matches the bNTI table
# for the join inside bNTIRCPlot()
RCb = read.csv(paste(phypath,"/5_RCb.csv",sep = ""),row.names = 1) %>%
dplyr::mutate(Sample_1 = Site2, Sample_2 = Site1)
head(RCb)
# Console output (for reference):
#   Site1 Site2 RCb Sample_1 Sample_2
#   1 sample1 sample10 1 sample10 sample1
#   ... (remaining rows omitted)
result = bNTIRCPlot(ps = psphy ,RCb = RCb,bNTI = bNTI,group = "Group")
# bNTI boxplot
p3 <- result[[1]]
p3
# RCbray / assembly-process barplot
p4 <- result[[2]]
p4
# Combined panel (bNTI + RCbray)
p5 <- result[[3]]
p5
# Merged pairwise table
plotdata = result[[4]]
head(plotdata)
# Console output (for reference):
#   Sample_1 Sample_2 bMNTD mean_bMNTD sd_bMNTD bNTI Group_1 Group_2 Site1 Site2 RCb
#   1 sample10 sample1 0.004675 0.003640 0.001402 0.738 Group2 Group1 sample1 sample10 1
#   ... (remaining rows omitted)
# Per-group process percentages
dat = result[[5]]
head(dat)
# Console output (for reference):
#   Group process n_sites perc num
#   1 Group1 Dispersal Limited 12 80.00 15
#   2 Group1 Variable Selection 3 20.00 15
#   ... (remaining rows omitted)
filename = paste(phypath,"/6_bNTI_RCbray.csv",sep = "")
write.csv(plotdata,filename)
FileName <- paste(phypath,"6_bNTI", ".pdf", sep = "")
ggsave(FileName, p3,width =8,height = 6)
FileName <- paste(phypath,"6_RCbary", ".pdf", sep = "")
ggsave(FileName, p4,width = 6,height = 6)
FileName <- paste(phypath,"6_BNTI_RCbray", ".pdf", sep = "")
ggsave(FileName, p5,width = 12,height = 8)
FileName <- paste(phypath,"6_bNTI", ".png", sep = "")
ggsave(FileName, p3,width =8,height = 6)
FileName <- paste(phypath,"6_RCbary", ".png", sep = "")
ggsave(FileName, p4,width = 6,height = 6)
FileName <- paste(phypath,"6_BNTI_RCbray", ".png", sep = "")
ggsave(FileName, p5,width = 12,height = 8)
FileName <- paste(phypath,"6_RCbray.percent.csv", sep = "")
write.csv(dat,FileName, quote = F)
# Correlate per-pair environmental differences with bNTI for every variable
# in `env`: for each environmental column, builds |env_i - env_j| for all
# sample pairs, runs a Mantel test against the pairwise bNTI values, and
# draws a scatter plot annotated with the Mantel r and p.
#
# otu/tax/map/tree/ps : alternative inputs assembled into one phyloseq
#                       object by inputMicro() (project helper).
# bNTIRC : long table of sample pairs with a bNTI column (e.g. the
#          6_bNTI_RCbray.csv table). NOTE(review): the default references a
#          global `RCbNTI` — errors if that global is absent; confirm.
# env    : data.frame of environmental variables, rownames = sample IDs.
# group  : grouping column name in the sample metadata.
# Returns list(arranged_panel, list_of_per-variable_plots).
EnvCorbNTI = function(otu = NULL,
tax = NULL,
map = NULL,
tree = NULL,
ps = NULL,
bNTIRC = RCbNTI,
env = env,
group = "Group"){
ps = inputMicro(otu,tax,map,tree,ps,group = group)
ps
#------------ Define the correlation (Mantel) helper -------------------------
# df = data
# Pivots the delta and bNTI columns of the pair table into 'dist' objects
# and runs vegan::mantel() between them.
Sams.mantel.test = function(df, seed=NULL) {
# Run mantel test to see if there is a correlation
delta.mat = df %>%
select(Sample_1, Sample_2, delta) %>%
spread(Sample_2, delta)
rownames(delta.mat) = delta.mat$Sample_1
delta.mat$Sample_1 = NULL
# Reorder rows/columns by their count of non-missing entries so the pair
# triangle lines up before coercion to 'dist'.
delta.mat = delta.mat[names(sort(rowSums(!is.na(delta.mat)), decreasing = FALSE)), names(sort(colSums(!is.na(delta.mat)), decreasing = TRUE))]
delta.mat = as.dist(delta.mat)
bNTI.mat = df %>%
select(Sample_1, Sample_2, bNTI) %>%
spread(Sample_2, bNTI)
rownames(bNTI.mat) = bNTI.mat$Sample_1
bNTI.mat$Sample_1 = NULL
bNTI.mat = bNTI.mat[names(sort(rowSums(!is.na(bNTI.mat)), decreasing = FALSE)), names(sort(colSums(!is.na(bNTI.mat)), decreasing = TRUE))]
bNTI.mat = as.dist(bNTI.mat)
if (!(is.null(seed))){
set.seed(seed)
}
# NOTE(review): only bNTI NAs are zeroed, not delta NAs — if delta can
# contain NAs, vegan::mantel() will fail; confirm upstream completeness.
bNTI.mat[is.na(bNTI.mat)] = 0
mantel.res = vegan::mantel(delta.mat, bNTI.mat)
return(mantel.res)
}
set.seed(72) # setting seed for reproducibility
psrare = rarefy_even_depth(ps)
# Check per-sample sequence depth after rarefaction
sample_sums(psrare)
# Relative-abundance transform. NOTE(review): ps.norm appears unused below.
ps.norm = transform_sample_counts(psrare, function(x) x/sum(x))
map = as.data.frame(sample_data(psrare))
# map = data.frame(row.names = map$id,id = map$id,Group = map$Group)
# Merge sample metadata with the environmental table by rownames (sample IDs)
mapE =merge(map,env,by = "row.names",all= FALSE)
row.names(mapE) = mapE$Row.names
mapE$Row.names = NULL
mapE$ID = row.names(mapE)
head(mapE)
#--------- Loop over environmental variables ---------------------------------
# i = "Altitude..m."
plot = list()
for (i in colnames(env)) {
# Temporarily rename column i to "XX" so it can be selected by a fixed name.
# NOTE(review): gsub() treats i as a regex — names with metacharacters
# (e.g. "Altitude..m.") may match and rename unintended columns; confirm.
colnames(mapE) = gsub(i,"XX",colnames(mapE))
# Attach the variable's value for each member of every sample pair
pH.meta1=mapE %>%
dplyr::select(ID, XX) %>%
dplyr::rename(Sample_1 = ID, env1_1 = XX)
pH.meta2= mapE%>%
dplyr::select(ID, XX) %>%
dplyr::rename(Sample_2 = ID, env1_2 = XX)
data = dplyr::inner_join(bNTIRC, pH.meta1) %>%
dplyr::inner_join(pH.meta2) %>%
dplyr::mutate(delta = abs(env1_1-env1_2),
crosstype = ifelse(Group_1 == Group_2, as.character(Group_1), "across"))
head(data)
data$crosstype
# Run mantel test to see if there is a correlation
pH.mantel = Sams.mantel.test(data, seed=72)
head(data)
# Scatter of |env difference| vs bNTI; dashed lines mark the |bNTI| = 2
# threshold conventionally separating stochastic from deterministic assembly.
p = ggplot(data, aes(x=delta, y=bNTI)) +
geom_point(pch = 21) +
# scale_shape_manual(values=LandUse.shapes) +
geom_hline(yintercept = 2, linetype=2) +
geom_hline(yintercept = -2, linetype=2) +
# annotate("text", x=3.25, y=12.5, label=paste("r= ", round(pH.mantel$statistic, 3), "\n", "p= ", round(pH.mantel$signif, 3), sep="")) +
labs(x=paste("",i), y="βNTI",title = paste("r= ", round(pH.mantel$statistic, 3), "p= ", round(pH.mantel$signif, 3))) +
theme(legend.position = "none") +theme_bw()
p
plot[[i]] = p
# Restore the original column name before the next iteration
colnames(mapE) = gsub("XX",i,colnames(mapE))
}
library(ggpubr)
# Arrange all per-variable panels into a single figure
p = ggarrange(plotlist = plot, common.legend = TRUE, legend="right")
p
return(list(p,plot))
}
# Mantel test between pairwise environmental differences ("delta") and
# pairwise bNTI, both supplied as one long table of sample pairs.
#
# df   : data.frame with columns Sample_1, Sample_2, delta, bNTI
#        (one row per sample pair).
# seed : optional integer, set immediately before the permutation test so
#        the vegan::mantel() significance is reproducible.
# Returns the vegan::mantel() result object.
#
# Fixes vs original: TRUE/FALSE instead of the reassignable T/F; explicit
# dplyr::/tidyr:: namespacing (several loaded packages export a competing
# select()); duplicated pivot logic factored into one local helper.
Sams.mantel.test = function(df, seed = NULL) {
  # Pivot a Sample_1/Sample_2/value pair table into a wide matrix, order
  # rows/columns by their count of non-missing entries so the pair triangle
  # lines up, then coerce to 'dist'.
  long_to_dist = function(long3) {
    wide = tidyr::spread(long3, Sample_2, value)
    rownames(wide) = wide$Sample_1
    wide$Sample_1 = NULL
    wide = wide[names(sort(rowSums(!is.na(wide)), decreasing = FALSE)),
                names(sort(colSums(!is.na(wide)), decreasing = TRUE))]
    as.dist(wide)
  }
  delta.mat = long_to_dist(dplyr::select(df, Sample_1, Sample_2, value = delta))
  bNTI.mat = long_to_dist(dplyr::select(df, Sample_1, Sample_2, value = bNTI))
  if (!is.null(seed)) {
    set.seed(seed)
  }
  # NOTE(review): as in the original, only bNTI NAs are zeroed; if delta can
  # contain NAs, vegan::mantel() will fail — confirm upstream completeness.
  bNTI.mat[is.na(bNTI.mat)] = 0
  vegan::mantel(delta.mat, bNTI.mat)
}
# df = data
# Variant of Sams.mantel.test for pair tables whose sample columns are named
# sample1/sample2 (lowercase) rather than Sample_1/Sample_2.
# NOTE(review): this redefinition silently overwrites the version defined
# just above — only this one is visible to later callers; keep whichever
# matches your data's column names.
#
# df   : data.frame with columns sample1, sample2, delta, bNTI.
# seed : optional integer, set immediately before the permutation test so
#        the vegan::mantel() significance is reproducible.
# Returns the vegan::mantel() result object.
#
# Fixes vs original: TRUE/FALSE instead of T/F; explicit dplyr::/tidyr::
# namespacing; duplicated pivot logic factored into one local helper.
Sams.mantel.test = function(df, seed = NULL) {
  # Pivot a sample1/sample2/value pair table into a wide matrix, order
  # rows/columns by their count of non-missing entries so the pair triangle
  # lines up, then coerce to 'dist'.
  long_to_dist = function(long3) {
    wide = tidyr::spread(long3, sample2, value)
    rownames(wide) = wide$sample1
    wide$sample1 = NULL
    wide = wide[names(sort(rowSums(!is.na(wide)), decreasing = FALSE)),
                names(sort(colSums(!is.na(wide)), decreasing = TRUE))]
    as.dist(wide)
  }
  delta.mat = long_to_dist(dplyr::select(df, sample1, sample2, value = delta))
  bNTI.mat = long_to_dist(dplyr::select(df, sample1, sample2, value = bNTI))
  if (!is.null(seed)) {
    set.seed(seed)
  }
  # NOTE(review): as in the original, only bNTI NAs are zeroed; if delta can
  # contain NAs, vegan::mantel() will fail — confirm upstream completeness.
  bNTI.mat[is.na(bNTI.mat)] = 0
  vegan::mantel(delta.mat, bNTI.mat)
}
# -- Correlate environmental differences with bNTI ---------------------------
# Re-read the merged bNTI/RCbray pair table written in the previous step.
bNTIRC = read.csv(paste0(phypath, "/6_bNTI_RCbray.csv"), row.names = 1)
head(bNTIRC)
# Console output for reference (the original script had it pasted unquoted,
# which makes the file unparseable; kept here as comments):
#   Sample_1 Sample_2    bMNTD mean_bMNTD sd_bMNTD  bNTI Group_1 Group_2   Site1
# 1 sample10  sample1 0.004675   0.003640 0.001402 0.738  Group2  Group1 sample1
# 2 sample11  sample1 0.007456   0.006143 0.002386 0.550  Group2  Group1 sample1
# 3 sample12  sample1 0.002150   0.001382 0.000688 1.117  Group2  Group1 sample1
# 4 sample13  sample1 0.002822   0.001092 0.000873 1.980  Group3  Group1 sample1
# 5 sample14  sample1 0.001703   0.001007 0.000783 0.889  Group3  Group1 sample1
# 6 sample15  sample1 0.000923   0.000493 0.000330 1.304  Group3  Group1 sample1
#      Site2 RCb
# 1 sample10   1
# 2 sample11   1
# 3 sample12   1
# 4 sample13   1
# 5 sample14   1
# 6 sample15   1

map = sample_data(psphy)
# head(map)

# Mantel test + scatter plot of |env difference| vs bNTI for every variable
# in envRDA.
plot = EnvCorbNTI(ps = psphy,
                  bNTIRC = bNTIRC,
                  group = "Group",
                  env = envRDA)

# Combined panel of all per-variable plots
p6 <- plot[[1]]
p6
# Individual panels are available as plot[[2]][i]
# plot[[2]][1]

# NOTE(review): the original built these paths without the "/" separator
# (paste(phypath, "7_env_corWithBNTI", ".pdf", sep = "")), unlike the read
# above; file.path() makes the figure paths consistent.
FileName <- file.path(phypath, "7_env_corWithBNTI.pdf")
ggsave(FileName, p6, width = 16, height = 14)
FileName <- file.path(phypath, "7_env_corWithBNTI.png")
ggsave(FileName, p6, width = 16, height = 14)